File Coverage

blib/lib/Algorithm/DecisionTree.pm
Criterion      Covered    Total       %
statement          788     2250    35.0
branch             200      708    28.2
condition           41      232    17.6
subroutine          45       92    48.9
pod                  8       40    20.0
total             1082     3322    32.5


line stmt bran cond sub pod time code
1             package Algorithm::DecisionTree;
2              
3             #--------------------------------------------------------------------------------------
4             # Copyright (c) 2016 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::DecisionTree is a Perl module for decision-tree based classification of
9             # multidimensional data.
10             # -------------------------------------------------------------------------------------
11              
12             #use 5.10.0;
13 1     1   10953 use strict;
  1         2  
  1         21  
14 1     1   3 use warnings;
  1         1  
  1         17  
15 1     1   3 use Carp;
  1         3  
  1         12203  
16              
17             our $VERSION = '3.41';
18              
19             ############################################ Constructor ##############################################
20             sub new {
21 1     1 1 310 my ($class, %args, $eval_or_boosting_mode);
22 1 50       10 if (@_ % 2 != 0) {
23 1         5 ($class, %args) = @_;
24             } else {
25 0         0 $class = shift;
26 0         0 $eval_or_boosting_mode = shift;
27 0 0 0     0 die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
28 0 0       0 die "Only one string arg allowed in eval and boosting modes" if @_;
29             }
30 1 50       2 unless ($eval_or_boosting_mode) {
31 1         4 my @params = keys %args;
32 1 50       2 croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
33             if check_for_illegal_params2(@params) == 0;
34             }
35             bless {
36             _training_datafile => $args{training_datafile},
37             _entropy_threshold => $args{entropy_threshold} || 0.01,
38             _max_depth_desired => exists $args{max_depth_desired} ?
39             $args{max_depth_desired} : undef,
40             _debug1 => $args{debug1} || 0,
41             _debug2 => $args{debug2} || 0,
42             _debug3 => $args{debug3} || 0,
43             _csv_class_column_index => $args{csv_class_column_index} || undef,
44             _csv_columns_for_features => $args{csv_columns_for_features} || undef,
45             _symbolic_to_numeric_cardinality_threshold
46             => $args{symbolic_to_numeric_cardinality_threshold} || 10,
47             _number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
48 1 50 50     45 _training_data => [],
      50        
      50        
      50        
      50        
      50        
      50        
      50        
49             _root_node => undef,
50             _probability_cache => {},
51             _entropy_cache => {},
52             _training_data_hash => {},
53             _features_and_values_hash => {},
54             _samples_class_label_hash => {},
55             _class_names => [],
56             _class_priors => [],
57             _class_priors_hash => {},
58             _feature_names => [],
59             _numeric_features_valuerange_hash => {},
60             _sampling_points_for_numeric_feature_hash => {},
61             _feature_values_how_many_uniques_hash => {},
62             _prob_distribution_numeric_features_hash => {},
63             _histogram_delta_hash => {},
64             _num_of_histogram_bins_hash => {},
65             }, $class;
66              
67             }
68              
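A minimal constructor sketch (not part of the covered source): the keyword names are the ones accepted by new() above; the file name and column indices are hypothetical.

    my $dt = Algorithm::DecisionTree->new(
                 training_datafile                         => 'training_data.csv',   # hypothetical file
                 csv_class_column_index                    => 1,                     # hypothetical layout
                 csv_columns_for_features                  => [2, 3, 4, 5],
                 entropy_threshold                         => 0.01,
                 max_depth_desired                         => 5,
                 symbolic_to_numeric_cardinality_threshold => 10,
             );

Passing a single string argument instead of keyword pairs selects the 'evalmode'/'boostingmode' path handled in the else branch of the constructor.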
69             #################################### Classify with Decision Tree #######################################
70              
71             ## Classifies one test sample at a time using the decision tree constructed from
72             ## your training file. The data record for the test sample must be supplied as
73             ## shown in the scripts in the `examples' subdirectory. See the scripts
74             ## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
75             sub classify {
76 0     0 1 0 my $self = shift;
77 0         0 my $root_node = shift;
78 0         0 my $feature_and_values = shift;
79 0         0 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
80 0         0 my @features_and_values = @$feature_and_values;
81 0         0 @features_and_values = @{deep_copy_array(\@features_and_values)};
  0         0  
82 0 0       0 die "Error in the names you have used for features and/or values"
83             unless $self->check_names_used(\@features_and_values);
84 0         0 my @new_features_and_values = ();
85 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
86 0         0 foreach my $feature_and_value (@features_and_values) {
87 0         0 $feature_and_value =~ /$pattern/;
88 0         0 my ($feature, $value) = ($1, $2);
89 0         0 my $newvalue = $value;
90 0         0 my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
91 0         0 my $not_all_values_float = 0;
92 0 0       0 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  0         0  
93 0 0 0     0 if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
  0         0  
94             $not_all_values_float == 0) {
95 0         0 $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
96             }
97 0         0 push @new_features_and_values, "$feature" . '=' . "$newvalue";
98             }
99 0         0 @features_and_values = @new_features_and_values;
100 0 0       0 print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
101 0         0 my %answer = ();
102 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
103 0         0 $answer{$class_name} = undef;
104             }
105 0         0 $answer{'solution_path'} = [];
106 0         0 my %classification = %{$self->recursive_descent_for_classification($root_node,
  0         0  
107             \@features_and_values,\%answer)};
108 0         0 @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
  0         0  
  0         0  
109 0 0       0 if ($self->{_debug3}) {
110 0         0 print "\nCL2 The classification:\n";
111 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
112 0         0 print " $class_name with probability $classification{$class_name}\n";
113             }
114             }
115 0         0 my %classification_for_display = ();
116 0         0 foreach my $item (keys %classification) {
117 0 0       0 if ($item ne 'solution_path') {
118 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
119             } else {
120 0         0 my @outlist = ();
121 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
122 0         0 push @outlist, "NODE$x";
123             }
124 0         0 $classification_for_display{$item} = \@outlist;
125             }
126             }
127 0         0 return \%classification_for_display;
128             }
129              
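A hedged sketch of a classify() call (not part of the covered source), assuming a decision tree has already been grown with construct_decision_tree_classifier() further down; the feature names and values are hypothetical and must be supplied as 'feature=value' strings, which is what the parsing pattern above expects.

    my @test_sample    = ('feature_1=b', 'feature_2=3.5', 'feature_3=x');   # hypothetical sample
    my $classification = $dt->classify($root_node, \@test_sample);
    foreach my $class (sort keys %$classification) {
        next if $class eq 'solution_path';
        print "$class => $classification->{$class}\n";          # probability formatted to 3 decimals
    }
    print "solution path: @{$classification->{'solution_path'}}\n";   # NODE serial numbers, root first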
130             sub recursive_descent_for_classification {
131 0     0 0 0 my $self = shift;
132 0         0 my $node = shift;
133 0         0 my $features_and_values = shift;
134 0         0 my $answer = shift;
135 0         0 my @features_and_values = @$features_and_values;
136 0         0 my %answer = %$answer;
137 0         0 my @children = @{$node->get_children()};
  0         0  
138 0 0       0 if (@children == 0) {
139 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
140 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
141 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
142             }
143 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
144 0         0 return \%answer;
145             }
146 0         0 my $feature_tested_at_node = $node->get_feature();
147             print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
148 0 0       0 if $self->{_debug3};
149 0         0 my $value_for_feature;
150             my $path_found;
151 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
152 0         0 foreach my $feature_and_value (@features_and_values) {
153 0         0 $feature_and_value =~ /$pattern/;
154 0 0       0 $value_for_feature = $2 if $feature_tested_at_node eq $1;
155             }
156             # The following clause introduced in Version 3.20
157 0 0       0 if (!defined $value_for_feature) {
158 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
159 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
160 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
161             }
162 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
163 0         0 return \%answer;
164             }
165 0 0       0 if ($value_for_feature) {
166 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
167 0 0       0 print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
168 0         0 my $pattern1 = '(.+)<(.+)';
169 0         0 my $pattern2 = '(.+)>(.+)';
170 0         0 foreach my $child (@children) {
171 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
172 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
173 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern1/) {
174 0         0 my ($feature, $threshold) = ($1,$2);
175 0 0       0 if ($value_for_feature <= $threshold) {
176 0         0 $path_found = 1;
177 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
178             $features_and_values,\%answer)};
179 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
180 0         0 last;
181             }
182             }
183 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern2/) {
184 0         0 my ($feature, $threshold) = ($1,$2);
185 0 0       0 if ($value_for_feature > $threshold) {
186 0         0 $path_found = 1;
187 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
188             $features_and_values,\%answer)};
189 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
190 0         0 last;
191             }
192             }
193             }
194 0 0       0 return \%answer if $path_found;
195             } else {
196 0         0 my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
197             print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
198 0 0       0 if $self->{_debug3};
199 0         0 foreach my $child (@children) {
200 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
201 0 0       0 print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
202 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
203 0 0       0 if ($last_feature_and_value_on_branch eq $feature_value_combo) {
204 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
205             $features_and_values,\%answer)};
206 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
207 0         0 $path_found = 1;
208 0         0 last;
209             }
210             }
211 0 0       0 return \%answer if $path_found;
212             }
213             }
214 0 0       0 if (! $path_found) {
215 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
216 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
217 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
218             }
219 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
220             }
221 0         0 return \%answer;
222             }
223              
224             ## If you want classification to be carried out by engaging a human user in a
225             ## question-answer session, this is the method to use for that purpose. See, for
226             ## example, the script classify_by_asking_questions.pl in the `examples'
227             ## subdirectory for an illustration of how to do that.
228             sub classify_by_asking_questions {
229 0     0 1 0 my $self = shift;
230 0         0 my $root_node = shift;
231 0         0 my %answer = ();
232 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
233 0         0 $answer{$class_name} = undef;
234             }
235 0         0 $answer{'solution_path'} = [];
236 0         0 my %scratchpad_for_numeric_answers = ();
237 0         0 foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
  0         0  
238 0         0 $scratchpad_for_numeric_answers{$feature_name} = undef;
239             }
240 0         0 my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
  0         0  
241             \%answer, \%scratchpad_for_numeric_answers)};
242 0         0 @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
  0         0  
  0         0  
243 0         0 my %classification_for_display = ();
244 0         0 foreach my $item (keys %classification) {
245 0 0       0 if ($item ne 'solution_path') {
246 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
247             } else {
248 0         0 my @outlist = ();
249 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
250 0         0 push @outlist, "NODE$x";
251             }
252 0         0 $classification_for_display{$item} = \@outlist;
253             }
254             }
255 0         0 return \%classification_for_display;
256             }
257              
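A sketch of the interactive route (not part of the covered source); the method prompts on standard input for one feature value at a time and returns the same kind of hash as classify().

    my $classification = $dt->classify_by_asking_questions($root_node);
    foreach my $class (sort keys %$classification) {
        print "$class => $classification->{$class}\n" unless $class eq 'solution_path';
    }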
258             sub interactive_recursive_descent_for_classification {
259 0     0 0 0 my $self = shift;
260 0         0 my $node = shift;
261 0         0 my $answer = shift;
262 0         0 my $scratchpad_for_numerics = shift;
263 0         0 my %answer = %$answer;
264 0         0 my %scratchpad_for_numerics = %$scratchpad_for_numerics;
265 0         0 my $pattern1 = '(.+)<(.+)';
266 0         0 my $pattern2 = '(.+)>(.+)';
267 0         0 my $user_value_for_feature;
268 0         0 my @children = @{$node->get_children()};
  0         0  
269 0 0       0 if (@children == 0) {
270 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
271 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
272 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
273             }
274 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
275 0         0 return \%answer;
276             }
277 0         0 my @list_of_branch_attributes_to_children = ();
278 0         0 foreach my $child (@children) {
279 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
280 0         0 my $feature_and_value_on_branch = $branch_features_and_values[-1];
281 0         0 push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
282             }
283 0         0 my $feature_tested_at_node = $node->get_feature();
284 0         0 my $feature_value_combo;
285 0         0 my $path_found = 0;
286 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
287 0 0       0 if ($scratchpad_for_numerics{$feature_tested_at_node}) {
288 0         0 $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
289             } else {
290 0         0 my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
  0         0  
291 0         0 while (1) {
292 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
293 0         0 print "\nEnter a value in the range (@valuerange): ";
294 0         0 $user_value_for_feature = <STDIN>;
295 0         0 $user_value_for_feature =~ s/\r?\n?$//;
296 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
297 0         0 my $answer_found = 0;
298 0 0 0     0 if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
299 0         0 $answer_found = 1;
300 0         0 last;
301             }
302 0 0       0 last if $answer_found;
303 0         0 print("You entered an illegal value. Let's try again");
304             }
305 0         0 $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
306             }
307 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
308 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
309 0 0       0 if ($branch_attribute =~ /$pattern1/) {
310 0         0 my ($feature,$threshold) = ($1,$2);
311 0 0       0 if ($user_value_for_feature <= $threshold) {
312 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
313             \%answer, \%scratchpad_for_numerics)};
314 0         0 $path_found = 1;
315 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
316 0         0 last;
317             }
318             }
319 0 0       0 if ($branch_attribute =~ /$pattern2/) {
320 0         0 my ($feature,$threshold) = ($1,$2);
321 0 0       0 if ($user_value_for_feature > $threshold) {
322 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
323             \%answer, \%scratchpad_for_numerics)};
324 0         0 $path_found = 1;
325 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
326 0         0 last;
327             }
328             }
329             }
330 0 0       0 return \%answer if $path_found;
331             } else {
332 0         0 my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
  0         0  
333 0         0 while (1) {
334 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
335 0         0 print "\nEnter a value from the list (@possible_values_for_feature): ";
336 0         0 $user_value_for_feature = <STDIN>;
337 0         0 $user_value_for_feature =~ s/\r?\n?$//;
338 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
339 0         0 my $answer_found = 0;
340 0 0       0 if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
341 0         0 $answer_found = 1;
342 0         0 last;
343             }
344 0 0       0 last if $answer_found;
345 0         0 print("You entered an illegal value. Let's try again");
346             }
347 0         0 $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
348 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
349 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
350 0 0       0 if ($branch_attribute eq $feature_value_combo) {
351 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
352             \%answer, \%scratchpad_for_numerics)};
353 0         0 $path_found = 1;
354 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
355 0         0 last;
356             }
357             }
358 0 0       0 return \%answer if $path_found;
359             }
360 0 0       0 if (! $path_found) {
361 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
362 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
363 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
364             }
365 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
366             }
367 0         0 return \%answer;
368             }
369              
370             ###################################### Decision Tree Construction ####################################
371              
372             ## At the root node, we find the best feature that yields the greatest reduction in
373             ## class entropy from the entropy based on just the class priors. The logic for
374             ## finding this feature is different for symbolic features and for numeric features.
375             ## That logic is built into the best_feature_calculator() method shown further down.
376             sub construct_decision_tree_classifier {
377 1     1 1 7 print "\nConstructing the decision tree ...\n";
378 1         1 my $self = shift;
379 1 50       3 if ($self->{_debug3}) {
380 0         0 $self->determine_data_condition();
381 0         0 print "\nStarting construction of the decision tree:\n";
382             }
383 1         5 my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
  2         5  
  1         2  
384 1 50       3 if ($self->{_debug3}) {
385 0         0 print "\nPrior class probabilities: @class_probabilities\n";
386 0         0 print "\nClass names: @{$self->{_class_names}}\n";
  0         0  
387             }
388 1         4 my $entropy = $self->class_entropy_on_priors();
389 1 50       3 print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
390 1         7 my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
391 1         1 $root_node->set_class_names(\@{$self->{_class_names}});
  1         3  
392 1         1 $self->{_root_node} = $root_node;
393 1         3 $self->recursive_descent($root_node);
394 1         5 return $root_node;
395             }
396              
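A sketch of the usual call order around this method (not part of the covered source), assuming the training data has already been read in and parsed: the first-order probabilities and the class priors are computed before the tree is grown, which is the order in which the public methods shown further down are meant to be called.

    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
    my $root_node = $dt->construct_decision_tree_classifier();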
397             ## After the root node of the decision tree is calculated by the previous methods,
398             ## we invoke this method recursively to create the rest of the tree. At each node,
399             ## we find the feature that achieves the largest entropy reduction with regard to
400             ## the partitioning of the training data samples that correspond to that node.
401             sub recursive_descent {
402 30     30 0 21 my $self = shift;
403 30         20 my $node = shift;
404             print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
405 30 50       44 if $self->{_debug3};
406 30         31 my $node_serial_number = $node->get_serial_num();
407 30         23 my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
  30         32  
408 30         35 my $existing_node_entropy = $node->get_node_entropy();
409 30 50       42 if ($self->{_debug3}) {
410 0         0 print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
411 0         0 print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
412 0         0 print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
413 0         0 my @class_probs = @{$node->get_class_probabilities()};
  0         0  
414 0         0 print "\nRD4 Class probabilities: @class_probs\n";
415             }
416 30 100       61 if ($existing_node_entropy < $self->{_entropy_threshold}) {
417 1 50       3 print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
418 1         3 return;
419             }
420 29         19 my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  29         34  
421 29         42 my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
422             $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
423 29         46 $node->set_feature($best_feature);
424 29 50       38 $node->display_node() if $self->{_debug3};
425 29 50 33     47 if (defined($self->{_max_depth_desired}) &&
426             (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
427 0 0       0 print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
428 0         0 return;
429             }
430 29 50       41 return if ! defined $best_feature;
431 29 50       32 if ($self->{_debug3}) {
432 0         0 print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
433 0         0 print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
434 0         0 print "\nRD9 Best feature entropy: $best_feature_entropy\n";
435 0         0 print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
436             }
437 29         32 my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
438 29 50       32 print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
439 29 100       35 if ($entropy_gain > $self->{_entropy_threshold}) {
440 19 50 33     39 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
441             $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
442             $self->{_symbolic_to_numeric_cardinality_threshold}) {
443 0         0 my $best_threshold = $decision_val; # as returned by best feature calculator
444 0         0 my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
445             my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
446 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
447             my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
448 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
449 0         0 my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
450 0         0 my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
451 0         0 push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
452             $feature_threshold_combo_for_less_than;
453 0         0 push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
454             $feature_threshold_combo_for_greater_than;
455 0 0       0 if ($self->{_debug3}) {
456 0         0 print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
457             "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
458 0         0 print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
459             "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
460             }
461             my @class_probabilities_for_lessthan_child_node =
462 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
463 0         0 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
  0         0  
464             my @class_probabilities_for_greaterthan_child_node =
465 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
466 0         0 \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
  0         0  
467 0 0       0 if ($self->{_debug3}) {
468 0         0 print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
469 0         0 print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
470             }
471 0 0       0 if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
472 0         0 my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
473             \@class_probabilities_for_lessthan_child_node,
474             \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
475 0         0 $node->add_child_link($left_child_node);
476 0         0 $self->recursive_descent($left_child_node);
477             }
478 0 0       0 if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
479 0         0 my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
480             \@class_probabilities_for_greaterthan_child_node,
481             \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
482 0         0 $node->add_child_link($right_child_node);
483 0         0 $self->recursive_descent($right_child_node);
484             }
485             } else {
486             print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
487 19 50       22 if $self->{_debug3};
488 19         14 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
  19         36  
489 19 50       25 print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
490 19         17 my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
  61         109  
491 19         19 my @class_entropies_for_children = ();
492 19         30 foreach my $feature_and_value_index (0..@feature_value_combos-1) {
493             print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
494 61 50       84 if $self->{_debug3};
495 61         33 my @extended_branch_features_and_values_or_thresholds;
496 61 100       71 if (! @features_and_values_or_thresholds_on_branch) {
497 4         6 @extended_branch_features_and_values_or_thresholds =
498             ($feature_value_combos[$feature_and_value_index]);
499             } else {
500             @extended_branch_features_and_values_or_thresholds =
501 57         40 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  57         64  
502 57         68 push @extended_branch_features_and_values_or_thresholds,
503             $feature_value_combos[$feature_and_value_index];
504             }
505             my @class_probabilities =
506 122         177 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
507 61         35 \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
  61         72  
508 61         73 my $class_entropy_for_child =
509             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
510             \@extended_branch_features_and_values_or_thresholds);
511 61 50       84 if ($self->{_debug3}) {
512 0         0 print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
513 0         0 print "\nRD20 class entropy for child: $class_entropy_for_child\n";
514             }
515 61 100       79 if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
516 29         53 my $child_node = DTNode->new(undef, $class_entropy_for_child,
517             \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
518 29         40 $node->add_child_link($child_node);
519 29         80 $self->recursive_descent($child_node);
520             } else {
521 32 50       92 print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
522             }
523             }
524             }
525             } else {
526             print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
527 10 50       17 if $self->{_debug3};
528 10         39 return;
529             }
530             }
531              
532             ## This is the heart of the decision tree constructor. Its main job is to figure
533             ## out the best feature to use for partitioning the training data samples that
534             ## correspond to the current node. The search for the best feature is carried out
535             ## differently for symbolic features and for numeric features. For a symbolic
536             ## feature, the method estimates the entropy for each value of the feature and then
537             ## averages out these entropies as a measure of the discriminatory power of that
538             ## feature. For a numeric feature, on the other hand, it estimates the entropy
539             ## reduction that can be achieved if we were to partition the set of training samples
540             ## at each possible threshold. For a numeric feature, all possible sampling points
541             ## relevant to the node in question are considered as candidates for thresholds.
542             sub best_feature_calculator {
543 29     29 0 24 my $self = shift;
544 29         27 my $features_and_values_or_thresholds_on_branch = shift;
545 29         16 my $existing_node_entropy = shift;
546 29         29 my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
547 29         23 my $pattern1 = '(.+)=(.+)';
548 29         18 my $pattern2 = '(.+)<(.+)';
549 29         17 my $pattern3 = '(.+)>(.+)';
550 29         34 my @all_symbolic_features = ();
551 29         19 foreach my $feature_name (@{$self->{_feature_names}}) {
  29         38  
552             push @all_symbolic_features, $feature_name
553 116 50       171 if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
554             }
555 29         25 my @symbolic_features_already_used = ();
556 29         27 foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
557 81 50       309 push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
558             }
559 29         17 my @symbolic_features_not_yet_used;
560 29         71 foreach my $x (@all_symbolic_features) {
561 116 100       112 push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
562             }
563 29         26 my @true_numeric_types = ();
564 29         23 my @symbolic_types = ();
565 29         18 my @true_numeric_types_feature_names = ();
566 29         32 my @symbolic_types_feature_names = ();
567 29         26 foreach my $item (@features_and_values_or_thresholds_on_branch) {
568 81 50       250 if ($item =~ /$pattern2/) {
    50          
    50          
569 0         0 push @true_numeric_types, $item;
570 0         0 push @true_numeric_types_feature_names, $1;
571             } elsif ($item =~ /$pattern3/) {
572 0         0 push @true_numeric_types, $item;
573 0         0 push @true_numeric_types_feature_names, $1;
574             } elsif ($item =~ /$pattern1/) {
575 81         64 push @symbolic_types, $item;
576 81         103 push @symbolic_types_feature_names, $1;
577             } else {
578 0         0 die "format error in the representation of features and values or thresholds";
579             }
580             }
581 29         34 my %seen = ();
582 29 0       27 @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
  0         0  
583 29         15 %seen = ();
584 29 50       26 @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
  81         191  
585             my @bounded_intervals_numeric_types =
586 29         23 @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
  29         41  
587             # Calculate the upper and the lower bounds to be used when searching for the best
588             # threshold for each of the numeric features that are in play at the current node:
589 29         25 my (%upperbound, %lowerbound);
590 29         26 foreach my $feature (@true_numeric_types_feature_names) {
591 0         0 $upperbound{$feature} = undef;
592 0         0 $lowerbound{$feature} = undef;
593             }
594 29         27 foreach my $item (@bounded_intervals_numeric_types) {
595 0         0 foreach my $feature_grouping (@$item) {
596 0 0       0 if ($feature_grouping->[1] eq '>') {
597 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
598             } else {
599 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
600             }
601             }
602             }
603 29         30 my %entropy_values_for_different_features = ();
604 29         17 my %partitioning_point_child_entropies_hash = ();
605 29         22 my %partitioning_point_threshold = ();
606 29         25 my %entropies_for_different_values_of_symbolic_feature = ();
607 29         34 foreach my $feature (@{$self->{_feature_names}}) {
  29         37  
608 116         113 $entropy_values_for_different_features{$feature} = [];
609 116         90 $partitioning_point_child_entropies_hash{$feature} = {};
610 116         89 $partitioning_point_threshold{$feature} = undef;
611 116         126 $entropies_for_different_values_of_symbolic_feature{$feature} = [];
612             }
613 29         22 foreach my $i (0..@{$self->{_feature_names}}-1) {
  29         49  
614 116         95 my $feature_name = $self->{_feature_names}->[$i];
615 116 50       139 print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
616 116 100 33     116 if (contained_in($feature_name, @symbolic_features_already_used)) {
    50          
617 81         87 next;
618 35         58 } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
619             $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
620             $self->{_symbolic_to_numeric_cardinality_threshold}) {
621 0         0 my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
622 0 0       0 print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
623 0         0 my @newvalues = ();
624 0 0       0 if (contained_in($feature_name, @true_numeric_types_feature_names)) {
625 0 0 0     0 if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
626             $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
627 0         0 next;
628             } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
629             $lowerbound{$feature_name} < $upperbound{$feature_name}) {
630 0         0 foreach my $x (@values) {
631 0 0 0     0 push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
632             }
633             } elsif (defined($upperbound{$feature_name})) {
634 0         0 foreach my $x (@values) {
635 0 0       0 push @newvalues, $x if $x <= $upperbound{$feature_name};
636             }
637             } elsif (defined($lowerbound{$feature_name})) {
638 0         0 foreach my $x (@values) {
639 0 0       0 push @newvalues, $x if $x > $lowerbound{$feature_name};
640             }
641             } else {
642 0         0 die "Error in bound specifications in best feature calculator";
643             }
644             } else {
645 0         0 @newvalues = @{deep_copy_array(\@values)};
  0         0  
646             }
647 0 0       0 next if @newvalues == 0;
648 0         0 my @partitioning_entropies = ();
649 0         0 foreach my $value (@newvalues) {
650 0         0 my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
651 0         0 my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
652 0         0 my @for_left_child;
653             my @for_right_child;
654 0 0       0 if (@features_and_values_or_thresholds_on_branch) {
655 0         0 @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
656 0         0 push @for_left_child, $feature_and_less_than_value_string;
657 0         0 @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
658 0         0 push @for_right_child, $feature_and_greater_than_value_string;
659             } else {
660 0         0 @for_left_child = ($feature_and_less_than_value_string);
661 0         0 @for_right_child = ($feature_and_greater_than_value_string);
662             }
663 0         0 my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
664             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
665 0         0 my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
666             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667 0         0 my $partitioning_entropy = $entropy1 *
668             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
669             $entropy2 *
670             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
671              
672 0         0 push @partitioning_entropies, $partitioning_entropy;
673 0         0 $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
674             }
675 0         0 my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
676 0 0       0 if ($min_entropy < $existing_node_entropy) {
677 0         0 $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
678 0         0 $entropy_values_for_different_features{$feature_name} = $min_entropy;
679             }
680             } else {
681 35 50       44 print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
682 35 50       49 print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
683 35         21 my %seen;
684 110 50       359 my @values = grep {$_ ne 'NA' && !$seen{$_}++}
685 35         24 @{$self->{_features_and_unique_values_hash}->{$feature_name}};
  35         47  
686 35         65 @values = sort @values;
687 35 50       45 print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
688              
689 35         25 my $entropy = 0;
690 35         29 foreach my $value (@values) {
691 110         163 my $feature_value_string = "$feature_name" . '=' . "$value";
692 110 50       139 print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
693 110         67 my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  110         133  
694 110 100       154 if (@features_and_values_or_thresholds_on_branch) {
695 96         79 push @extended_attributes, $feature_value_string;
696             } else {
697 14         15 @extended_attributes = ($feature_value_string);
698             }
699 110         143 $entropy +=
700             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
701             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
702             print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
703 110 50       160 "is $entropy\n" if $self->{_debug3};
704 110         68 push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
  110         208  
705             }
706 35 50       49 if ($entropy < $existing_node_entropy) {
707 35         88 $entropy_values_for_different_features{$feature_name} = $entropy;
708             }
709             }
710             }
711 29         25 my $min_entropy_for_best_feature;
712             my $best_feature_name;
713 29         53 foreach my $feature_nom (keys %entropy_values_for_different_features) {
714 116 100       112 if (!defined($best_feature_name)) {
715 29         24 $best_feature_name = $feature_nom;
716 29         44 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
717             } else {
718 87 100       133 if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
719 30         27 $best_feature_name = $feature_nom;
720 30         32 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
721             }
722             }
723             }
724 29         30 my $threshold_for_best_feature;
725 29 50       31 if (exists $partitioning_point_threshold{$best_feature_name}) {
726 29         22 $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
727             } else {
728 0         0 $threshold_for_best_feature = undef;
729             }
730 29         31 my $best_feature_entropy = $min_entropy_for_best_feature;
731 29         22 my @val_based_entropies_to_be_returned;
732             my $decision_val_to_be_returned;
733 29 50 33     74 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
734             $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
735             $self->{_symbolic_to_numeric_cardinality_threshold}) {
736             @val_based_entropies_to_be_returned =
737 0         0 @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
  0         0  
738             } else {
739 29         25 @val_based_entropies_to_be_returned = ();
740             }
741 29 50       32 if (exists $partitioning_point_threshold{$best_feature_name}) {
742 29         20 $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
743             } else {
744 0         0 $decision_val_to_be_returned = undef;
745             }
746             print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
747 29 50       47 "@val_based_entropies_to_be_returned\n" if $self->{_debug3};
748 29         164 return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
749             $decision_val_to_be_returned);
750             }
751              
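The symbolic branch above scores a candidate feature by the probability-weighted average of the class entropies of its values. A tiny standalone sketch with hypothetical numbers:

    use strict; use warnings;
    # Hypothetical symbolic feature F with values v1 and v2: P(F=v) and the class
    # entropy conditioned on F=v are assumed numbers, accumulated the same way the
    # loop over @values above accumulates entropy * probability for each value.
    my %p_of_value       = (v1 => 0.6,  v2 => 0.4);
    my %entropy_of_value = (v1 => 0.72, v2 => 0.97);
    my $weighted_entropy = 0;
    $weighted_entropy += $p_of_value{$_} * $entropy_of_value{$_} for keys %p_of_value;
    printf "average class entropy if we test on F: %.3f bits\n", $weighted_entropy;   # prints 0.820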
752             ######################################### Entropy Calculators #####################################
753              
754             sub class_entropy_on_priors {
755 1     1 0 2 my $self = shift;
756             return $self->{_entropy_cache}->{'priors'}
757 1 50       2 if exists $self->{_entropy_cache}->{"priors"};
758 1         2 my @class_names = @{$self->{_class_names}};
  1         2  
759 1         1 my $entropy;
760 1         2 foreach my $class (@class_names) {
761 2         6 my $prob = $self->prior_probability_for_class($class);
762 2 50 33     18 my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
763 2 50       5 $log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
764 2 50       4 $log_prob = 0 if $prob > 0.999; # since log(1) = 0
765 2 100       4 if (!defined $entropy) {
766 1         1 $entropy = -1.0 * $prob * $log_prob;
767 1         2 next;
768             }
769 1         3 $entropy += -1.0 * $prob * $log_prob;
770             }
771 1         2 $self->{_entropy_cache}->{'priors'} = $entropy;
772 1         3 return $entropy;
773             }
774              
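A standalone check of the prior-based entropy computed above, with hypothetical class priors:

    use strict; use warnings;
    # Two hypothetical classes with priors 0.35 and 0.65; entropy = -sum p * log2(p).
    my @class_priors = (0.35, 0.65);
    my $entropy = 0;
    $entropy += -1.0 * $_ * log($_) / log(2) for @class_priors;
    printf "class entropy on priors: %.3f bits\n", $entropy;   # prints 0.934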
775             sub entropy_scanner_for_a_numeric_feature {
776 0     0 0 0 local $| = 1;
777 0         0 my $self = shift;
778 0         0 my $feature = shift;
779 0         0 my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
  0         0  
780 0         0 my @entropies_for_less_than_thresholds = ();
781 0         0 my @entropies_for_greater_than_thresholds = ();
782 0         0 foreach my $point (@all_sampling_points) {
783 0         0 print ". ";
784 0         0 push @entropies_for_less_than_thresholds,
785             $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
786 0         0 push @entropies_for_greater_than_thresholds,
787             $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
788             }
789 0         0 print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
790             "@entropies_for_less_than_thresholds\n";
791 0         0 print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
792             "@entropies_for_greater_than_thresholds\n";
793             }
794              
795             sub class_entropy_for_less_than_threshold_for_feature {
796 0     0 0 0 my $self = shift;
797 0         0 my $arr = shift;
798 0         0 my $feature = shift;
799 0         0 my $threshold = shift;
800 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
801 0         0 my $feature_threshold_combo = "$feature" . '<' . "$threshold";
802 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
803 0         0 $sequence .= ":" . $feature_threshold_combo;
804 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
805             my @copy_of_array_of_features_and_values_or_thresholds =
806 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
807 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
808 0         0 my $entropy = 0;
809 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
810 0         0 my $log_prob = undef;
811 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
812             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
813 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
814 0         0 $log_prob = log($prob) / log(2.0);
815             } elsif ($prob < 0.0001) {
816 0         0 $log_prob = 0;
817             } elsif ($prob > 0.999) {
818 0         0 $log_prob = 0;
819             } else {
820 0         0 die "An error has occurred in log_prob calculation";
821             }
822 0         0 $entropy += -1.0 * $prob * $log_prob;
823             }
824 0 0       0 if (abs($entropy) < 0.0000001) {
825 0         0 $entropy = 0.0;
826             }
827 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
828 0         0 return $entropy;
829             }
830              
831             sub class_entropy_for_greater_than_threshold_for_feature {
832 0     0 0 0 my $self = shift;
833 0         0 my $arr = shift;
834 0         0 my $feature = shift;
835 0         0 my $threshold = shift;
836 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
837 0         0 my $feature_threshold_combo = "$feature" . '>' . "$threshold";
838 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
839 0         0 $sequence .= ":" . $feature_threshold_combo;
840 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
841             my @copy_of_array_of_features_and_values_or_thresholds =
842 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
843 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
844 0         0 my $entropy = 0;
845 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
846 0         0 my $log_prob = undef;
847 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
848             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
849 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
850 0         0 $log_prob = log($prob) / log(2.0);
851             } elsif ($prob < 0.0001) {
852 0         0 $log_prob = 0;
853             } elsif ($prob > 0.999) {
854 0         0 $log_prob = 0;
855             } else {
856 0         0 die "An error has occurred in log_prob calculation";
857             }
858 0         0 $entropy += -1.0 * $prob * $log_prob;
859             }
860 0 0       0 if (abs($entropy) < 0.0000001) {
861 0         0 $entropy = 0.0;
862             }
863 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
864 0         0 return $entropy;
865             }
866              
867             sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
868 171     171 0 119 my $self = shift;
869 171         100 my $array_of_features_and_values_or_thresholds = shift;
870 171         178 my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
871 171         191 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
872 171 100       302 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
873 110         82 my $entropy = 0;
874 110         75 foreach my $class_name (@{$self->{_class_names}}) {
  110         138  
875 220         147 my $log_prob = undef;
876 220         250 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
877             $class_name, \@array_of_features_and_values_or_thresholds);
878 220 100 100     585 if ($prob >= 0.0001 && $prob <= 0.999) {
    100          
    50          
879 218         242 $log_prob = log($prob) / log(2.0);
880             } elsif ($prob < 0.0001) {
881 1         2 $log_prob = 0;
882             } elsif ($prob > 0.999) {
883 1         2 $log_prob = 0;
884             } else {
885 0         0 die "An error has occurred in log_prob calculation";
886             }
887 220         258 $entropy += -1.0 * $prob * $log_prob;
888             }
889 110 100       156 if (abs($entropy) < 0.0000001) {
890 1         1 $entropy = 0.0;
891             }
892 110         150 $self->{_entropy_cache}->{$sequence} = $entropy;
893 110         172 return $entropy;
894             }
895              
896              
897             ##################################### Probability Calculators ########################################
898              
899             sub prior_probability_for_class {
900 4     4 0 3 my $self = shift;
901 4         2 my $class = shift;
902 4         5 my $class_name_in_cache = "prior" . '::' . $class;
903             return $self->{_probability_cache}->{$class_name_in_cache}
904 4 50       11 if exists $self->{_probability_cache}->{$class_name_in_cache};
905 0         0 my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
  0         0  
906 0         0 my @values = values %{$self->{_samples_class_label_hash}};
  0         0  
907 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
908 0         0 my @trues = grep {$_ eq $class_name} @values;
  0         0  
909 0         0 my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
910 0         0 my $this_class_name_in_cache = "prior" . '::' . $class_name;
911 0         0 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
912             }
913 0         0 return $self->{_probability_cache}->{$class_name_in_cache};
914             }
915              
916             sub calculate_class_priors {
917 1     1 1 6 my $self = shift;
918 1 50       1 return if scalar keys %{$self->{_class_priors_hash}} > 1;
  1         5  
919 1         2 foreach my $class_name (@{$self->{_class_names}}) {
  1         2  
920 2         4 my $class_name_in_cache = "prior::$class_name";
921 2         1 my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
  2         4  
922 2         2 my @all_values = values %{$self->{_samples_class_label_hash}};
  2         10  
923 2         3 my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
  70         54  
924 2         4 my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
925 2         2 $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
926 2         4 my $this_class_name_in_cache = "prior::$class_name";
927 2         7 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
928             }
929 1 50       4 if ($self->{_debug1}) {
930 0         0 foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
  0         0  
931 0         0 print "$class => $self->{_class_priors_hash}->{$class}\n";
932             }
933             }
934             }
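The prior computed above is simply each class's relative frequency among the training samples. A minimal sketch with hypothetical sample labels:

    # Made-up sample labels, not taken from any real training file:
    my %samples_class_label = (sample_1 => 'class=1', sample_2 => 'class=0',
                               sample_3 => 'class=1', sample_4 => 'class=1');
    my $total = scalar keys %samples_class_label;
    my %priors;
    $priors{$_}++ for values %samples_class_label;       # count each label
    $priors{$_} /= $total for keys %priors;              # class=1 => 0.75, class=0 => 0.25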
935              
936             sub calculate_first_order_probabilities {
937 1     1 1 15 print "\nEstimating probabilities...\n";
938 1         2 my $self = shift;
939 1         1 foreach my $feature (@{$self->{_feature_names}}) {
  1         3  
940 4         10 $self->probability_of_feature_value($feature, undef);
941 4 50       9 if ($self->{_debug2}) {
942 0 0       0 if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
943 0         0 print "\nPresenting probability distribution for a numeric feature:\n";
944 0         0 foreach my $sampling_point (sort {$a <=> $b} keys
  0         0  
945 0         0 %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
946 0         0 my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
947             print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
948 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
949             }
950             } else {
951 0         0 print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
952 0         0 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
953 0         0 foreach my $value (sort @values_for_feature) {
954 0         0 my $prob = $self->probability_of_feature_value($feature,$value);
955 0         0 print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
956             }
957             }
958             }
959             }
960             }
961              
962             sub probability_of_feature_value {
963 286     286 0 193 my $self = shift;
964 286         209 my $feature_name = shift;
965 286         211 my $value = shift;
966 286 50 66     1097 $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
967 286 50 66     535 if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
968             $value = closest_sampling_point($value,
969 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
970             }
971 286         153 my $feature_and_value;
972 286 100       363 if (defined($value)) {
973 282         291 $feature_and_value = "$feature_name=$value";
974             }
975 286 50 66     535 if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
976 282         507 return $self->{_probability_cache}->{$feature_and_value};
977             }
978 4         7 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
979 4 50       6 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
980 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
981             $self->{_symbolic_to_numeric_cardinality_threshold}) {
982 0 0       0 if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
983 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
984 0         0 $diffrange = $valuerange[1] - $valuerange[0];
985 0         0 my %seen = ();
986 0 0 0     0 my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
  0         0  
987 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
988 0         0 my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
  0         0  
  0         0  
989             $unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
990 0         0 my $median_diff = $diffs[int(@diffs/2) - 1];
991 0         0 $histogram_delta = $median_diff * 2;
992 0 0       0 if ($histogram_delta < $diffrange / 500.0) {
993 0 0       0 if (defined $self->{_number_of_histogram_bins}) {
994 0         0 $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
995             } else {
996 0         0 $histogram_delta = $diffrange / 500.0;
997             }
998             }
999 0         0 $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1000 0         0 $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1001 0         0 $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1002 0         0 my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
  0         0  
1003             0..$num_of_histogram_bins-1;
1004 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1005             \@sampling_points_for_feature;
1006             }
1007             }
1008             }
1009 4 50       6 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1010 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1011             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1012             my @sampling_points_for_feature =
1013 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1014 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1015 0         0 my @actual_values_for_feature = grep {$_ ne 'NA'}
1016 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1017 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1018 0         0 foreach my $j (0..@actual_values_for_feature-1) {
1019 0 0       0 if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1020 0         0 $counts_at_sampling_points[$i]++
1021             }
1022             }
1023             }
1024 0         0 my $total_counts = 0;
1025 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1026 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1027 0         0 my %bin_prob_hash = ();
1028 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1029 0         0 $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1030             }
1031 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1032 0         0 my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
  0         0  
1033             @sampling_points_for_feature;
1034 0         0 foreach my $i (0..@values_for_feature-1) {
1035 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1036             }
1037 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1038 0         0 return $self->{_probability_cache}->{$feature_and_value};
1039             } else {
1040 0         0 return 0;
1041             }
1042             } else {
1043 0         0 my %seen = ();
1044 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1045 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1046 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1047 0         0 my @value_counts = (0) x @values_for_feature;
1048             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1049 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
  0         0  
  0         0  
1050 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1051 0         0 foreach my $i (0..@values_for_feature-1) {
1052 0         0 foreach my $current_value (@features_and_values) {
1053 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1054             }
1055             }
1056             }
1057 0         0 my $total_counts = 0;
1058 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1059 0 0       0 die "PFV Something is wrong with your training file. It contains no training samples \
1060             for feature named $feature_name" if $total_counts == 0;
1061 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
  0         0  
1062 0         0 foreach my $i (0..@values_for_feature-1) {
1063 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1064             }
1065 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1066 0         0 return $self->{_probability_cache}->{$feature_and_value};
1067             } else {
1068 0         0 return 0;
1069             }
1070             }
1071             } else {
1072             # This section is only for purely symbolic features:
1073 4         4 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  4         18  
1074 4         5 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  140         187  
1075 4         17 my @value_counts = (0) x @values_for_feature;
1076             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1077 4         4 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
  564         505  
  4         21  
1078 140         79 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         180  
1079 140         127 foreach my $i (0..@values_for_feature-1) {
1080 4900         3257 for my $current_value (@features_and_values) {
1081 19600 100       22722 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1082             }
1083             }
1084             }
1085 4         13 foreach my $i (0..@values_for_feature-1) {
1086             $self->{_probability_cache}->{$values_for_feature[$i]} =
1087 140         82 $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
  140         176  
1088             }
1089 4 50 33     11 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1090 0         0 return $self->{_probability_cache}->{$feature_and_value};
1091             } else {
1092 4         16 return 0;
1093             }
1094             }
1095             }
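For the purely symbolic branch actually exercised above, the probability of a feature=value pair is its count over all training samples divided by the number of samples. For a high-cardinality numeric feature, the method instead builds a histogram whose bin width is twice the median gap between successive unique values, never smaller than diffrange/500 (or diffrange/number_of_histogram_bins when that constructor option is given). A simplified sketch of that bin-width rule, with made-up values and without the number_of_histogram_bins branch:

    # Hypothetical sorted unique values for one numeric feature, 'NA' already removed:
    my @unique_values = (1.0, 1.25, 1.5, 2.0, 3.0);
    my $diffrange     = $unique_values[-1] - $unique_values[0];          # 2.0
    my @diffs = sort {$a <=> $b}
                map  {$unique_values[$_] - $unique_values[$_ - 1]} 1 .. $#unique_values;
    my $median_diff     = $diffs[int(@diffs / 2) - 1];                   # 0.25
    my $histogram_delta = $median_diff * 2;                              # 0.5
    $histogram_delta = $diffrange / 500.0 if $histogram_delta < $diffrange / 500.0;
    my $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;  # 5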
1096              
1097             sub probability_of_feature_value_given_class {
1098 572     572 0 393 my $self = shift;
1099 572         423 my $feature_name = shift;
1100 572         327 my $feature_value = shift;
1101 572         384 my $class_name = shift;
1102 572 50 33     1647 $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1103 572 50 33     1057 if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1104             $feature_value = closest_sampling_point($feature_value,
1105 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1106             }
1107 572         308 my $feature_value_class;
1108 572 50       634 if (defined($feature_value)) {
1109 572         764 $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1110             }
1111 572 100 33     1190 if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1112             print "\nNext answer returned by cache for feature $feature_name and " .
1113 564 50       653 "value $feature_value given class $class_name\n" if $self->{_debug2};
1114 564         1101 return $self->{_probability_cache}->{$feature_value_class};
1115             }
1116 8         9 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1117              
1118 8 50       12 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1119 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1120             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1121 0         0 $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1122 0         0 $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1123 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
1124 0         0 $diffrange = $valuerange[1] - $valuerange[0];
1125             }
1126             }
1127 8         7 my @samples_for_class = ();
1128             # Accumulate all sample names for the given class:
1129 8         3 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  8         34  
1130 280 100       320 if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1131 140         116 push @samples_for_class, $sample_name;
1132             }
1133             }
1134 8 50       18 if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1135 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1136             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1137             my @sampling_points_for_feature =
1138 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1139 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1140 0         0 my @actual_feature_values_for_samples_in_class = ();
1141 0         0 foreach my $sample (@samples_for_class) {
1142 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1143 0         0 my $pattern = '(.+)=(.+)';
1144 0         0 $feature_and_value =~ /$pattern/;
1145 0         0 my ($feature, $value) = ($1, $2);
1146 0 0 0     0 if (($feature eq $feature_name) && ($value ne 'NA')) {
1147 0         0 push @actual_feature_values_for_samples_in_class, $value;
1148             }
1149             }
1150             }
1151 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1152 0         0 foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1153 0 0       0 if (abs($sampling_points_for_feature[$i] -
1154             $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1155 0         0 $counts_at_sampling_points[$i]++;
1156             }
1157             }
1158             }
1159 0         0 my $total_counts = 0;
1160 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1161 0 0       0 die "PFVC1 Something is wrong with your training file. It contains no training " .
1162             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1163 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1164 0         0 my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
  0         0  
1165             @sampling_points_for_feature;
1166 0         0 foreach my $i (0..@values_for_feature_and_class-1) {
1167 0         0 $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1168             }
1169 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1170 0         0 return $self->{_probability_cache}->{$feature_value_class};
1171             } else {
1172 0         0 return 0;
1173             }
1174             } else {
1175             # This section is for numeric features that will be treated symbolically
1176 0         0 my %seen = ();
1177 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1178 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1179 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1180 0         0 my @value_counts = (0) x @values_for_feature;
1181 0         0 foreach my $sample (@samples_for_class) {
1182 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1183 0         0 foreach my $i (0..@values_for_feature-1) {
1184 0         0 foreach my $current_value (@features_and_values) {
1185 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1186             }
1187             }
1188             }
1189 0         0 my $total_counts = 0;
1190 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1191 0 0       0 die "PFVC2 Something is wrong with your training file. It contains no training " .
1192             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1193             # We normalize by total_count because the probabilities are conditioned on a given class
1194 0         0 foreach my $i (0..@values_for_feature-1) {
1195 0         0 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1196 0         0 $self->{_probability_cache}->{$feature_and_value_and_class} =
1197             $value_counts[$i] / (1.0 * $total_counts);
1198             }
1199 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1200 0         0 return $self->{_probability_cache}->{$feature_value_class};
1201             } else {
1202 0         0 return 0;
1203             }
1204             }
1205             } else {
1206             # This section is for purely symbolic features
1207 8         5 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  8         36  
1208 8         8 my %seen = ();
1209 280 100 66     729 @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1210 8         6 @{$self->{_features_and_values_hash}->{$feature_name}};
  8         11  
1211 8         9 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  28         40  
1212 8         12 my @value_counts = (0) x @values_for_feature;
1213 8         8 foreach my $sample (@samples_for_class) {
1214 140         73 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         169  
1215 140         123 foreach my $i (0..@values_for_feature-1) {
1216 490         327 foreach my $current_value (@features_and_values) {
1217 1960 100       2381 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1218             }
1219             }
1220             }
1221 8         9 my $total_counts = 0;
1222 8         6 map {$total_counts += $_} @value_counts;
  28         23  
1223 8 50       10 die "PFVC2 Something is wrong with your training file. It contains no training " .
1224             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1225             # We normalize by total_count because the probabilities are conditioned on a given class
1226 8         9 foreach my $i (0..@values_for_feature-1) {
1227 28         29 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1228 28         55 $self->{_probability_cache}->{$feature_and_value_and_class} =
1229             $value_counts[$i] / (1.0 * $total_counts);
1230             }
1231 8 50       13 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1232 8         32 return $self->{_probability_cache}->{$feature_value_class};
1233             } else {
1234 0         0 return 0;
1235             }
1236             }
1237             }
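For symbolic features, the class-conditional probability above reduces to a per-class count of the feature=value pair, normalized by the total count over all values of that feature within the class. A counting sketch with hypothetical numbers:

    # Hypothetical per-class counts for one feature:
    my %value_counts_in_class = ('smoking=never' => 12, 'smoking=heavy' => 8);
    my $total_counts = 0;
    $total_counts += $_ for values %value_counts_in_class;
    my $p_never_given_class = $value_counts_in_class{'smoking=never'} / $total_counts;   # 0.6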
1238              
1239             sub probability_of_feature_less_than_threshold {
1240 0     0 0 0 my $self = shift;
1241 0         0 my $feature_name = shift;
1242 0         0 my $threshold = shift;
1243 0         0 my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1244             return $self->{_probability_cache}->{$feature_threshold_combo}
1245 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1246 0 0       0 my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
  0         0  
1247 0 0       0 my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
  0         0  
1248 0         0 my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1249 0         0 $self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1250 0         0 return $probability;
1251             }
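P(feature <= threshold) is estimated above as the fraction of all non-'NA' training values of the feature that do not exceed the threshold. A sketch with made-up values:

    # Hypothetical training values for one numeric feature:
    my @all_values  = (2.1, 3.4, 'NA', 5.0, 7.2);
    my @usable      = grep {$_ ne 'NA'} @all_values;
    my @below       = grep {$_ <= 4.0} @usable;
    my $probability = (1.0 * @below) / @usable;          # 2/4 = 0.5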
1252              
1253             sub probability_of_feature_less_than_threshold_given_class {
1254 0     0 0 0 my $self = shift;
1255 0         0 my $feature_name = shift;
1256 0         0 my $threshold = shift;
1257 0         0 my $class_name = shift;
1258 0         0 my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1259             return $self->{_probability_cache}->{$feature_threshold_class_combo}
1260 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1261 0         0 my @data_samples_for_class = ();
1262             # Accumulate all sample names for the given class:
1263 0         0 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  0         0  
1264             push @data_samples_for_class, $sample_name
1265 0 0       0 if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1266             }
1267 0         0 my @actual_feature_values_for_samples_in_class = ();
1268 0         0 foreach my $sample (@data_samples_for_class) {
1269 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1270 0         0 my $pattern = '(.+)=(.+)';
1271 0         0 $feature_and_value =~ /$pattern/;
1272 0         0 my ($feature,$value) = ($1,$2);
1273 0 0 0     0 push @actual_feature_values_for_samples_in_class, $value
1274             if $feature eq $feature_name && $value ne 'NA';
1275             }
1276             }
1277 0 0       0 my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
  0         0  
1278             # The condition in the assignment that follows was a bug correction in Version 3.20
1279 0 0       0 my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1280 0         0 $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1281 0         0 return $probability;
1282             }
1283              
1284             # This method requires that all truly numeric types only be expressed as '<' or '>'
1285             # constructs in the array of branch features and thresholds
1286             sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1287 329     329 0 213 my $self = shift;
1288 329         236 my $arr = shift;
1289 329         379 my @array_of_features_and_values_or_thresholds = @$arr;
1290 329 50       422 return if scalar @array_of_features_and_values_or_thresholds == 0;
1291 329         392 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1292 329 100       650 return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1293 96         74 my $probability = undef;
1294 96         64 my $pattern1 = '(.+)=(.+)';
1295 96         55 my $pattern2 = '(.+)<(.+)';
1296 96         68 my $pattern3 = '(.+)>(.+)';
1297 96         64 my @true_numeric_types = ();
1298 96         62 my @true_numeric_types_feature_names = ();
1299 96         66 my @symbolic_types = ();
1300 96         65 my @symbolic_types_feature_names = ();
1301 96         82 foreach my $item (@array_of_features_and_values_or_thresholds) {
1302 282 50       502 if ($item =~ /$pattern2/) {
    50          
1303 0         0 push @true_numeric_types, $item;
1304 0         0 my ($feature,$value) = ($1,$2);
1305 0         0 push @true_numeric_types_feature_names, $feature;
1306             } elsif ($item =~ /$pattern3/) {
1307 0         0 push @true_numeric_types, $item;
1308 0         0 my ($feature,$value) = ($1,$2);
1309 0         0 push @true_numeric_types_feature_names, $feature;
1310             } else {
1311 282         215 push @symbolic_types, $item;
1312 282         409 $item =~ /$pattern1/;
1313 282         323 my ($feature,$value) = ($1,$2);
1314 282         299 push @symbolic_types_feature_names, $feature;
1315             }
1316             }
1317 96         91 my %seen1 = ();
1318 96 0       89 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1319 96         63 my %seen2 = ();
1320 96 50       85 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  282         622  
1321 96         132 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1322             print_array_with_msg("POS: Answer returned by find_bounded: ",
1323 96 50       124 $bounded_intervals_numeric_types) if $self->{_debug2};
1324             # Calculate the upper and the lower bounds to be used when searching for the best
1325             # threshold for each of the numeric features that are in play at the current node:
1326 96         68 my (%upperbound, %lowerbound);
1327 96         87 foreach my $feature_name (@true_numeric_types_feature_names) {
1328 0         0 $upperbound{$feature_name} = undef;
1329 0         0 $lowerbound{$feature_name} = undef;
1330             }
1331 96         88 foreach my $item (@$bounded_intervals_numeric_types) {
1332 0         0 foreach my $feature_grouping (@$item) {
1333 0 0       0 if ($feature_grouping->[1] eq '>') {
1334 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1335             } else {
1336 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337             }
1338             }
1339             }
1340 96         85 foreach my $feature_name (@true_numeric_types_feature_names) {
1341 0 0 0     0 if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
      0        
1342             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1343 0         0 return 0;
1344             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1345 0 0       0 if (! $probability) {
1346             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1347             $upperbound{$feature_name}) -
1348 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1349             } else {
1350             $probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1351             $upperbound{$feature_name}) -
1352 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1353             }
1354             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1355 0 0       0 if (! $probability) {
1356             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1357 0         0 $upperbound{$feature_name});
1358             } else {
1359             $probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1360 0         0 $upperbound{$feature_name});
1361             }
1362             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1363 0 0       0 if (! $probability) {
1364             $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1365 0         0 $lowerbound{$feature_name});
1366             } else {
1367             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1368 0         0 $lowerbound{$feature_name}));
1369             }
1370             } else {
1371 0         0 die("Ill formatted call to 'probability_of_sequence' method");
1372             }
1373             }
1374 96         85 foreach my $feature_and_value (@symbolic_types) {
1375 282 50       750 if ($feature_and_value =~ /$pattern1/) {
1376 282         338 my ($feature,$value) = ($1,$2);
1377 282 100       276 if (! $probability) {
1378 96         138 $probability = $self->probability_of_feature_value($feature, $value);
1379             } else {
1380 186         203 $probability *= $self->probability_of_feature_value($feature, $value);
1381             }
1382             }
1383             }
1384 96         176 $self->{_probability_cache}->{$sequence} = $probability;
1385 96         197 return $probability;
1386             }
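On a branch consisting only of symbolic feature=value tests (the case exercised here), the joint probability is just the product of the individual feature-value probabilities; '<' and '>' tests on a numeric feature are first reduced to a bounded interval and contribute a difference of cumulative probabilities instead. A sketch of the factorization, with hypothetical marginals:

    # Hypothetical per-feature probabilities:
    my %p = ('exercising=never' => 0.4, 'smoking=heavy' => 0.25);
    my $joint = 1.0;
    $joint *= $p{$_} for keys %p;                        # 0.1
    # A numeric pair such as 'age>50','age<65' would instead contribute the factor
    # P(age<=65) - P(age<=50), as computed by probability_of_feature_less_than_threshold().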
1387              
1388             ## The following method requires that all truly numeric types only be expressed as
1389             ## '<' or '>' constructs in the array of branch features and thresholds
1390             sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1391 220     220 0 150 my $self = shift;
1392 220         133 my $arr = shift;
1393 220         167 my $class_name = shift;
1394 220         229 my @array_of_features_and_values_or_thresholds = @$arr;
1395 220 50       315 return if scalar @array_of_features_and_values_or_thresholds == 0;
1396 220         245 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1397 220         212 my $sequence_with_class = "$sequence" . "::" . $class_name;
1398             return $self->{_probability_cache}->{$sequence_with_class}
1399 220 100       343 if exists $self->{_probability_cache}->{$sequence_with_class};
1400 200         126 my $probability = undef;
1401 200         147 my $pattern1 = '(.+)=(.+)';
1402 200         103 my $pattern2 = '(.+)<(.+)';
1403 200         128 my $pattern3 = '(.+)>(.+)';
1404 200         134 my @true_numeric_types = ();
1405 200         144 my @true_numeric_types_feature_names = ();
1406 200         128 my @symbolic_types = ();
1407 200         132 my @symbolic_types_feature_names = ();
1408 200         171 foreach my $item (@array_of_features_and_values_or_thresholds) {
1409 572 50       1104 if ($item =~ /$pattern2/) {
    50          
1410 0         0 push @true_numeric_types, $item;
1411 0         0 my ($feature,$value) = ($1,$2);
1412 0         0 push @true_numeric_types_feature_names, $feature;
1413             } elsif ($item =~ /$pattern3/) {
1414 0         0 push @true_numeric_types, $item;
1415 0         0 my ($feature,$value) = ($1,$2);
1416 0         0 push @true_numeric_types_feature_names, $feature;
1417             } else {
1418 572         416 push @symbolic_types, $item;
1419 572         898 $item =~ /$pattern1/;
1420 572         733 my ($feature,$value) = ($1,$2);
1421 572         578 push @symbolic_types_feature_names, $feature;
1422             }
1423             }
1424 200         198 my %seen1 = ();
1425 200 0       187 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1426 200         138 my %seen2 = ();
1427 200 50       161 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  572         1307  
1428 200         269 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1429             print_array_with_msg("POSC: Answer returned by find_bounded: ",
1430 200 50       250 $bounded_intervals_numeric_types) if $self->{_debug2};
1431             # Calculate the upper and the lower bounds to be used when searching for the best
1432             # threshold for each of the numeric features that are in play at the current node:
1433 200         135 my (%upperbound, %lowerbound);
1434 200         177 foreach my $feature_name (@true_numeric_types_feature_names) {
1435 0         0 $upperbound{$feature_name} = undef;
1436 0         0 $lowerbound{$feature_name} = undef;
1437             }
1438 200         171 foreach my $item (@$bounded_intervals_numeric_types) {
1439 0         0 foreach my $feature_grouping (@$item) {
1440 0 0       0 if ($feature_grouping->[1] eq '>') {
1441 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1442             } else {
1443 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444             }
1445             }
1446             }
1447 200         158 foreach my $feature_name (@true_numeric_types_feature_names) {
1448 0 0 0     0 if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
      0        
1449             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1450 0         0 return 0;
1451             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1452 0 0       0 if (! $probability) {
1453              
1454             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1455             $upperbound{$feature_name}, $class_name) -
1456             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457 0         0 $lowerbound{$feature_name}, $class_name);
1458             } else {
1459             $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1460             $upperbound{$feature_name}, $class_name) -
1461             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462 0         0 $lowerbound{$feature_name}, $class_name))
1463             }
1464             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1465 0 0       0 if (! $probability) {
1466             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1467 0         0 $upperbound{$feature_name}, $class_name);
1468             } else {
1469             $probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1470 0         0 $upperbound{$feature_name}, $class_name);
1471             }
1472             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1473 0 0       0 if (! $probability) {
1474             $probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1475 0         0 $lowerbound{$feature_name}, $class_name);
1476             } else {
1477             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1478 0         0 $lowerbound{$feature_name}, $class_name));
1479             }
1480             } else {
1481 0         0 die("Ill formatted call to 'probability of sequence given class' method");
1482             }
1483             }
1484 200         150 foreach my $feature_and_value (@symbolic_types) {
1485 572 50       1496 if ($feature_and_value =~ /$pattern1/) {
1486 572         688 my ($feature,$value) = ($1,$2);
1487 572 100       560 if (! $probability) {
1488 200         287 $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1489             } else {
1490 372         379 $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491             }
1492             }
1493             }
1494 200         383 $self->{_probability_cache}->{$sequence_with_class} = $probability;
1495 200         454 return $probability;
1496             }
1497              
1498             sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1499 342     342 0 222 my $self = shift;
1500 342         257 my $class_name = shift;
1501 342         187 my $arr = shift;
1502 342         355 my @array_of_features_and_values_or_thresholds = @$arr;
1503 342         344 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1504 342         327 my $class_and_sequence = "$class_name" . "::" . $sequence;
1505             return $self->{_probability_cache}->{$class_and_sequence}
1506 342 100       822 if exists $self->{_probability_cache}->{$class_and_sequence};
1507 110         119 my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
  110         174  
1508 110         73 foreach my $i (0..@{$self->{_class_names}}-1) {
  110         137  
1509 220         173 my $class_name = $self->{_class_names}->[$i];
1510 220         261 my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1511             \@array_of_features_and_values_or_thresholds, $class_name);
1512 220 100       307 if ($prob < 0.000001) {
1513 1         1 $array_of_class_probabilities[$i] = 0.0;
1514 1         5 next;
1515             }
1516 219         278 my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1517             \@array_of_features_and_values_or_thresholds);
1518             # die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1519             # "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1520             # if ! $prob_of_feature_sequence;
1521 219         272 my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1522 219 50       227 if ($prob_of_feature_sequence) {
1523 219         302 $array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1524             } else {
1525 0         0 $array_of_class_probabilities[$i] = $prior;
1526             }
1527             }
1528 110         78 my $sum_probability;
1529 110         98 map {$sum_probability += $_} @array_of_class_probabilities;
  220         229  
1530 110 50       122 if ($sum_probability == 0) {
1531 0         0 @array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
  0         0  
1532 0         0 (0..@{$self->{_class_names}}-1);
  0         0  
1533             } else {
1534 110         88 @array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
  220         240  
1535             }
1536 110         97 foreach my $i (0..@{$self->{_class_names}}-1) {
  110         197  
1537 220         266 my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1538 220         464 $self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1539             }
1540 110         192 return $self->{_probability_cache}->{$class_and_sequence};
1541             }
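This is Bayes' rule: each class's posterior is proportional to P(sequence|class) * P(class), and the results are renormalized so they sum to one (the division by P(sequence) cancels in that normalization). A numeric sketch with hypothetical likelihoods and priors:

    my @likelihoods  = (0.02, 0.06);     # hypothetical P(sequence | class_i)
    my @priors       = (0.5,  0.5);      # P(class_i)
    my @unnormalized = map {$likelihoods[$_] * $priors[$_]} 0 .. $#likelihoods;
    my $sum = 0;
    $sum += $_ for @unnormalized;
    my @posteriors = map {$_ / $sum} @unnormalized;      # (0.25, 0.75)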
1542              
1543             ####################################### Class Based Utilities ##########################################
1544              
1545             ## Given a list of branch attributes for the numeric features of the form, say,
1546             ## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1547             ## smallest list that is relevant for the purpose of calculating the probabilities.
1548             ## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1549             ## time, less than 2, AND, at the same time, less than 3, is the same as the
1550             ## probability that the feature is less than 1. Similarly, the probability that 'age'
1551             ## is greater than 34 and also greater than 37 is the same as `age' being greater
1552             ## than 37.
1553             sub find_bounded_intervals_for_numeric_features {
1554 325     325 0 243 my $self = shift;
1555 325         194 my $arr = shift;
1556 325         277 my @arr = @$arr;
1557 325         180 my @features = @{$self->{_feature_names}};
  325         454  
1558 325         247 my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
  0         0  
  0         0  
1559 325 50       436 print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1560 325         201 my @arr3 = ();
1561 325         287 foreach my $feature_name (@features) {
1562 1300         812 my @temp = ();
1563 1300         956 foreach my $x (@arr1) {
1564 0 0 0     0 push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1565             }
1566 1300 50       1657 push @arr3, \@temp if @temp > 0;
1567             }
1568 325 50       403 print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1569             # Sort each list so that '<' entries occur before '>' entries:
1570 325         190 my @arr4;
1571 325         237 foreach my $li (@arr3) {
1572 0         0 my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
  0         0  
1573 0         0 push @arr4, \@sorted;
1574             }
1575 325 50       388 print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1576 325         179 my @arr5;
1577 325         274 foreach my $li (@arr4) {
1578 0         0 my @temp1 = ();
1579 0         0 my @temp2 = ();
1580 0         0 foreach my $inner (@$li) {
1581 0 0       0 if ($inner->[1] eq '<') {
1582 0         0 push @temp1, $inner;
1583             } else {
1584 0         0 push @temp2, $inner;
1585             }
1586             }
1587 0 0 0     0 if (@temp1 > 0 && @temp2 > 0) {
    0          
1588 0         0 push @arr5, [\@temp1, \@temp2];
1589             } elsif (@temp1 > 0) {
1590 0         0 push @arr5, [\@temp1];
1591             } else {
1592 0         0 push @arr5, [\@temp2];
1593             }
1594             }
1595 325 50       366 print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1596 325         226 my @arr6 = ();
1597 325         224 foreach my $li (@arr5) {
1598 0         0 my @temp1 = ();
1599 0         0 foreach my $inner (@$li) {
1600 0         0 my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
  0         0  
1601 0         0 push @temp1, \@sorted;
1602             }
1603 0         0 push @arr6, \@temp1;
1604             }
1605 325 50       384 print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1606 325         214 my @arr9 = ();
1607 325         276 foreach my $li (@arr6) {
1608 0         0 foreach my $alist (@$li) {
1609 0         0 my @newalist = ();
1610 0 0       0 if ($alist->[0][1] eq '<') {
1611 0         0 push @newalist, $alist->[0];
1612             } else {
1613 0         0 push @newalist, $alist->[-1];
1614             }
1615 0 0       0 if ($alist->[0][1] ne $alist->[-1][1]) {
1616 0         0 push @newalist, $alist->[-1];
1617             }
1618 0         0 push @arr9, \@newalist;
1619             }
1620             }
1621 325 50       366 print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1622 325         405 return \@arr9;
1623              
1624             }
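A standalone sketch of the reduction described in the comment above: for each numeric feature, only the tightest '<' bound and the tightest '>' bound survive. This is illustrative code, not the module's own, and the branch tests are hypothetical:

    my @branch = ('g2<1', 'g2<2', 'g2<3', 'age>34', 'age>36', 'age>37');
    my %tightest;
    foreach my $test (@branch) {
        my ($feature, $op, $threshold) = split /(<|>)/, $test;
        if ($op eq '<') {
            $tightest{$feature}{'<'} = $threshold
                if !defined $tightest{$feature}{'<'} || $threshold < $tightest{$feature}{'<'};
        } else {
            $tightest{$feature}{'>'} = $threshold
                if !defined $tightest{$feature}{'>'} || $threshold > $tightest{$feature}{'>'};
        }
    }
    # %tightest is now ( g2 => { '<' => 1 }, age => { '>' => 37 } ).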
1625              
1626             ## This method is used to verify that you used legal feature names in the test
1627             ## sample that you want to classify with the decision tree.
1628             sub check_names_used {
1629 0     0 0 0 my $self = shift;
1630 0         0 my $features_and_values_test_data = shift;
1631 0         0 my @features_and_values_test_data = @$features_and_values_test_data;
1632 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
1633 0         0 foreach my $feature_and_value (@features_and_values_test_data) {
1634 0         0 $feature_and_value =~ /$pattern/;
1635 0         0 my ($feature,$value) = ($1,$2);
1636 0 0 0     0 die "Your test data has formatting error" unless defined($feature) && defined($value);
1637 0 0       0 return 0 unless contained_in($feature, @{$self->{_feature_names}});
  0         0  
1638             }
1639 0         0 return 1;
1640             }
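A hedged usage sketch, in which $dt stands for an already constructed Algorithm::DecisionTree instance and the feature names are hypothetical:

    die "test sample uses a feature name not seen in the training data"
        unless $dt->check_names_used(['exercising=never', 'smoking=heavy']);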
1641              
1642             ####################################### Data Condition Calculator ######################################
1643              
1644             ## This method estimates the worst-case fan-out of the decision tree taking into
1645             ## account the number of values (and therefore the number of branches emanating from
1646             ## a node) for the symbolic features.
1647             sub determine_data_condition {
1648 0     0 0 0 my $self = shift;
1649 0         0 my $num_of_features = scalar @{$self->{_feature_names}};
  0         0  
1650 0         0 my @values = ();
1651 0         0 my @number_of_values;
1652 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
1653 0         0 push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1654 0 0       0 if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
  0         0  
1655 0         0 push @number_of_values, scalar @values;
1656             }
1657 0 0       0 return if ! @values;
1658 0         0 print "Number of features: $num_of_features\n";
1659 0         0 my @minmax = minmax(\@number_of_values);
1660 0         0 my $max_num_values = $minmax[1];
1661 0         0 print "Largest number of values for symbolic features is: $max_num_values\n";
1662 0         0 my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1663 0         0 print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1664             "nodes. The exact number of nodes created depends critically on " .
1665             "the entropy_threshold used for node expansion (the default value " .
1666             "for this threshold is 0.01) and on the value set for max_depth_desired " .
1667             "for the depth of the tree.\n";
1668 0 0       0 if ($estimated_number_of_nodes > 10000) {
1669 0         0 print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1670             "large value for entropy_threshold and/or a small value for " .
1671             "max_depth_desired to reduce the number of nodes created.\n";
1672 0         0 print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1673 0         0 my $answer = <STDIN>;
1674 0         0 $answer =~ s/\r?\n?$//;
1675 0   0     0 while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1676 0         0 print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1677 0         0 $answer = <STDIN>;
1678 0         0 $answer =~ s/\r?\n?$//;
1679             }
1680 0 0       0 die unless $answer =~ /y(es)?/i;
1681             }
1682             }
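A worked instance of the worst-case estimate printed above, with hypothetical counts:

    my $num_of_features = 4;               # hypothetical feature count
    my $max_num_values  = 3;               # largest number of values for any symbolic feature
    my $estimated_number_of_nodes = $max_num_values ** $num_of_features;   # 3**4 = 81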
1683              
1684              
1685             #################################### Read Training Data From File ######################################
1686              
1687              
1688             sub get_training_data {
1689 1     1 1 6 my $self = shift;
1690 1 50       8 die("Aborted. get_training_data() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1691 1         1 my %class_names = ();
1692 1         3 my %all_record_ids_with_class_labels;
1693             my $firstline;
1694 0         0 my %data_hash;
1695 1         3 $|++;
1696 1   50     22 open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";
1697 1         1 my $record_index = 0;
1699 1         23 while (<FILEIN>) {
1700 36 50       65 next if /^[ ]*\r?\n?$/;
1701 36         75 $_ =~ s/\r?\n?$//;
1702 36         38 my $record = cleanup_csv($_);
1703 36 100       49 if ($record_index == 0) {
1704 1         1 $firstline = $record;
1705 1         1 $record_index++;
1706 1         3 next;
1707             }
1708 35         112 my @parts = split /,/, $record;
1709 35         35 my $classname = $parts[$self->{_csv_class_column_index}];
1710 35         25 $class_names{$classname} = 1;
1711 35         23 my $record_label = shift @parts;
1712 35         45 $record_label =~ s/^\s*\"|\"\s*$//g;
1713 35         40 $data_hash{$record_label} = \@parts;
1714 35         32 $all_record_ids_with_class_labels{$record_label} = $classname;
1715 35 50       43 print "." if $record_index % 10000 == 0;
1716 35         70 $record_index++;
1717             }
1718 1         5 close FILEIN;
1719 1         2 $|--;
1720 1         3 $self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1721 1 50       3 print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1722 1         4 my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1723 1         2 my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1724 1         3 my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
  2         6  
1725 1         2 my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
  4         5  
  1         2  
1726 1         5 my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
  35         57  
1727 1         7 my @sample_names = map {"sample_$_"} keys %data_hash;
  35         29  
1728 1 50       5 my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
  35         26  
  35         15  
  140         107  
  140         96  
  140         288  
  35         34  
1729 1 50       5 my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
  4         4  
  4         3  
  4         11  
  140         81  
  140         193  
  140         108  
  1         2  
1730 1         2 my %numeric_features_valuerange_hash = ();
1731 1         1 my %feature_values_how_many_uniques_hash = ();
1732 1         1 my %features_and_unique_values_hash = ();
1733 1         2 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1734 1         3 foreach my $feature (keys %features_and_values_hash) {
1735 4         3 my %seen1 = ();
1736 140 100 66     398 my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1737 4         4 @{$features_and_values_hash{$feature}};
  4         4  
1738 4         6 $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1739 4         3 my $not_all_values_float = 0;
1740 4 50       5 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  14         89  
1741 4 50       6 if ($not_all_values_float == 0) {
1742 0         0 my @minmaxvalues = minmax(\@unique_values_for_feature);
1743 0         0 $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1744             }
1745 4         10 $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1746             }
1747 1 50       7 if ($self->{_debug1}) {
1748 0         0 print "\nAll class names: @all_class_names\n";
1749 0         0 print "\nEach sample data record:\n";
1750 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
  0         0  
1751 0         0 print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
  0         0  
1752             }
1753 0         0 print "\nclass label for each data sample:\n";
1754 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
  0         0  
1755 0         0 print "$sample => $class_for_sample_hash{$sample}\n";
1756             }
1757 0         0 print "\nFeatures used: @feature_names\n\n";
1758 0         0 print "\nfeatures and the values taken by them:\n";
1759 0         0 foreach my $feature (sort keys %features_and_values_hash) {
1760 0         0 print "$feature => @{$features_and_values_hash{$feature}}\n";
  0         0  
1761             }
1762 0         0 print "\nnumeric features and their ranges:\n";
1763 0         0 foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1764 0         0 print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
  0         0  
1765             }
1766 0         0 print "\nnumber of unique values in each feature:\n";
1767 0         0 foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1768 0         0 print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1769             }
1770             }
1771 1         2 $self->{_class_names} = \@all_class_names;
1772 1         2 $self->{_feature_names} = \@feature_names;
1773 1         1 $self->{_samples_class_label_hash} = \%class_for_sample_hash;
1774 1         6 $self->{_training_data_hash} = \%feature_values_for_samples_hash;
1775 1         2 $self->{_features_and_values_hash} = \%features_and_values_hash;
1776 1         1 $self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1777 1         2 $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1778 1         19 $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1779             }
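The CSV layout this reader expects, per the comments elsewhere in this file, sketched with hypothetical data: the first column holds a unique integer record ID and is left empty ("") in the header row, one column carries the class label, and the remaining columns hold the features. Double-quoted fields are allowed and empty fields become 'NA'.

    "","class","feature_1","feature_2"
    1,"benign",3.5,"never"
    2,"malignant",7.1,"heavy"

For this layout the constructor arguments would be something like csv_class_column_index => 1 and csv_columns_for_features => [2, 3], counting columns from 0 with the ID column included.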
1780              
1781             sub show_training_data {
1782 0     0 1 0 my $self = shift;
1783 0         0 my @class_names = @{$self->{_class_names}};
  0         0  
1784 0         0 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  0         0  
1785 0         0 my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
  0         0  
1786 0         0 my %training_data_hash = %{$self->{_training_data_hash}};
  0         0  
1787 0         0 print "\n\nClass Names: @class_names\n";
1788 0         0 print "\n\nFeatures and Their Values:\n\n";
1789 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
1790 0         0 print "$k ---> @{$features_and_values_hash{$k}}\n";
  0         0  
1791             }
1792 0         0 print "\n\nSamples vs. Class Labels:\n\n";
1793 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
  0         0  
1794 0         0 print "$kee => $samples_class_label_hash{$kee}\n";
1795             }
1796 0         0 print "\n\nTraining Samples:\n\n";
1797 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
  0         0  
1798             keys %training_data_hash) {
1799 0         0 print "$kee => @{$training_data_hash{$kee}}\n";
  0         0  
1800             }
1801             }
1802              
1803             sub get_class_names {
1804 0     0 0 0 my $self = shift;
1805 0         0 return @{$self->{_class_names}}
  0         0  
1806             }
1807              
1808             ########################################## Utility Routines ############################################
1809              
1810             sub closest_sampling_point {
1811 0     0 0 0 my $value = shift;
1812 0         0 my $arr_ref = shift;
1813 0         0 my @arr = @{$arr_ref};
  0         0  
1814 0         0 my @compare = map {abs($_ - $value)} @arr;
  0         0  
1815 0         0 my ($minval,$index) = minimum(\@compare);
1816 0         0 return $arr[$index];
1817             }
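Assuming the module has been loaded, a quick illustration with arbitrary values; the sampling point with the smallest absolute difference from the query value is returned:

    my $nearest = Algorithm::DecisionTree::closest_sampling_point(3.4, [1.0, 2.0, 3.0, 4.0]);   # 3.0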
1818              
1819             ## returns the array index that contains a specified STRING value: (meant only for array of strings)
1820             sub get_index_at_value {
1821 0     0 0 0 my $value = shift;
1822 0         0 my @array = @{shift @_};
  0         0  
1823 0         0 foreach my $i (0..@array-1) {
1824 0 0       0 return $i if $value eq $array[$i];
1825             }
1826             }
1827              
1828             ## When the training data is read from a CSV file, we assume that the first column
1829             ## of each data record contains a unique integer identifier for the record in that
1830             ## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1831             ## followed by the identifying integers. The purpose of this function is to return
1832             ## the identifying integer associated with a data record.
1833             sub sample_index {
1834 1128     1128 0 629 my $arg = shift;
1835 1128         1115 $arg =~ /_(.+)$/;
1836 1128         1253 return $1;
1837             }
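For example, assuming the module has been loaded:

    my $index = Algorithm::DecisionTree::sample_index('sample_23');   # 23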
1838              
1839             # Returns the minimum value and its positional index in an array
1840             sub minimum {
1841 0     0 0 0 my $arr = shift;
1842 0         0 my $min;
1843             my $index;
1844 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1845 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1846 0         0 $index = $i;
1847 0         0 $min = $arr->[$i];
1848             }
1849             }
1850 0         0 return ($min, $index);
1851             }
1852              
1853             # Returns an array of two values, the min and the max, of an array of floats
1854             sub minmax {
1855 0     0 0 0 my $arr = shift;
1856 0         0 my ($min, $max);
1857 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1858 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1859 0         0 $min = $arr->[$i];
1860             }
1861 0 0 0     0 if ( (!defined $max) || ($arr->[$i] > $max) ) {
1862 0         0 $max = $arr->[$i];
1863             }
1864             }
1865 0         0 return ($min, $max);
1866             }
1867              
1868             # checks whether an element is in an array:
1869             sub contained_in {
1870 267     267 0 176 my $ele = shift;
1871 267         269 my @array = @_;
1872 267         160 my $count = 0;
1873 267 100       181 map {$count++ if $ele eq $_} @array;
  648         1035  
1874 267         427 return $count;
1875             }
1876              
1877             # Meant only for an array of strings (no nesting):
1878             sub deep_copy_array {
1879 196     196 0 152 my $ref_in = shift;
1880 196         116 my $ref_out;
1881 196 100       254 return [] if scalar @$ref_in == 0;
1882 181         133 foreach my $i (0..@{$ref_in}-1) {
  181         232  
1883 393         423 $ref_out->[$i] = $ref_in->[$i];
1884             }
1885 181         283 return $ref_out;
1886             }
1887              
1888             sub check_for_illegal_params2 {
1889 1     1 0 2 my @params = @_;
1890 1         5 my @legal_params = qw / training_datafile
1891             entropy_threshold
1892             max_depth_desired
1893             csv_class_column_index
1894             csv_columns_for_features
1895             symbolic_to_numeric_cardinality_threshold
1896             number_of_histogram_bins
1897             debug1
1898             debug2
1899             debug3
1900             /;
1901 1         1 my $found_match_flag;
1902 1         2 foreach my $param (@params) {
1903 3         3 foreach my $legal (@legal_params) {
1904 10         9 $found_match_flag = 0;
1905 10 100       14 if ($param eq $legal) {
1906 3         1 $found_match_flag = 1;
1907 3         4 last;
1908             }
1909             }
1910 3 50       4 last if $found_match_flag == 0;
1911             }
1912 1         4 return $found_match_flag;
1913             }
1914              
1915             sub print_array_with_msg {
1916 0     0 0 0 my $message = shift;
1917 0         0 my $arr = shift;
1918 0         0 print "\n$message: ";
1919 0         0 print_nested_array( $arr );
1920             }
1921              
1922             sub print_nested_array {
1923 0     0 0 0 my $arr = shift;
1924 0         0 my @arr = @$arr;
1925 0         0 print "[";
1926 0         0 foreach my $item (@arr) {
1927 0 0       0 if (ref $item) {
1928 0         0 print_nested_array($item);
1929             } else {
1930 0         0 print "$item";
1931             }
1932             }
1933 0         0 print "]";
1934             }
1935              
1936             ## Introduced in Version 3.21, I wrote this function in response to a need to
1937             ## create a decision tree for a very large national econometric database. The
1938             ## fields in the CSV file for this database are allowed to be double quoted and such
1939             ## fields may contain commas inside them. This function also replaces empty fields
1940             ## with the generic string 'NA' as a shorthand for "Not Available". IMPORTANT: This
1941             ## function skips over the first field in each record. It is assumed that the first
1942             ## field in the first record that defines the feature names is the empty string ("")
1943             ## and the same field in all other records is an ID number for the record.
1944             sub cleanup_csv {
1945 36     36 0 28 my $line = shift;
1946 36         33 $line =~ tr/()[]{}'/ /;
1947 36         51 my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1948 36         35 for (@double_quoted) {
1949 0         0 my $item = $_;
1950 0         0 $item = substr($item, 1, -1);
1951 0         0 $item =~ s/^\s+|,|\s+$//g;
1952 0         0 $item = join '_', split /\s+/, $item;
1953 0         0 substr($line, index($line, $_), length($_)) = $item;
1954             }
1955 36         124 my @white_spaced = $line =~ /,\s*[^,]+\s+[^,]+\s*,/g;
1956 36         30 for (@white_spaced) {
1957 0         0 my $item = $_;
1958 0         0 $item = substr($item, 0, -1);
1959 0 0       0 $item = join '_', split /\s+/, $item unless $item =~ /,\s+$/;
1960 0         0 substr($line, index($line, $_), length($_)) = "$item,";
1961             }
1962 36         52 $line =~ s/,\s*(?=,)/,NA/g;
1963 36         38 return $line;
1964             }
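## As a rough illustration on a made-up record (not taken from any actual data
## file), a line such as
##
#     37,"Smith, John",acme inc,,"(sales)"
##
## would come back approximately as
##
#     37,Smith_John,acme_inc,NA,sales
##
## with the double-quoted and white-spaced fields underscore-joined, the
## parentheses blanked out, and the empty field replaced by 'NA'.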
1965            
1966             ######################################### Class EvalTrainingData ########################################
1967              
1968             ## This subclass of the DecisionTree class is used to evaluate the quality of your
1969             ## training data by running a 10-fold cross-validation test on it. This test divides
1970             ## all of the training data into ten parts, with nine parts used for training a
1971             ## decision tree and one part used for testing its ability to classify correctly.
1972             ## This selection of nine parts for training and one part for testing is carried out
1973             ## in all of the ten different possible ways. This testing functionality can also
1974             ## be used to find the best values to use for the constructor parameters
1975             ## entropy_threshold, max_depth_desired, and
1976             ## symbolic_to_numeric_cardinality_threshold.
1977              
1978             ## Only CSV training files can be evaluated in this manner (because only CSV
1979             ## training files are allowed to have numeric features --- which is the more
1980             ## interesting case for evaluation analytics).
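## A minimal usage sketch. The file name and the column indexes below are
## illustrative placeholders, and the get_training_data() call is assumed to be
## the module's usual CSV-reading step --- consult the main module
## documentation for the exact calling sequence:
##
#    my $evaldata = EvalTrainingData->new(
#                       training_datafile        => 'my_training_data.csv',
#                       csv_class_column_index   => 1,
#                       csv_columns_for_features => [2, 3],
#                       entropy_threshold        => 0.01,
#                       max_depth_desired        => 3,
#                       symbolic_to_numeric_cardinality_threshold => 10,
#                   );
#    $evaldata->get_training_data();
#    $evaldata->evaluate_training_data();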
1981              
1982             package EvalTrainingData;
1983              
1984             @EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1985              
1986             sub new {
1987 0     0   0 my $class = shift;
1988 0         0 my $instance = Algorithm::DecisionTree->new(@_);
1989 0         0 bless $instance, $class;
1990             }
1991              
1992             sub evaluate_training_data {
1993 0     0   0 my $self = shift;
1994 0         0 my $evaldebug = 0;
1995             die "The data evaluation function in the module can only be used when your " .
1996 0 0       0 "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1997 0         0 print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1998             "class-discriminatory power:\n";
1999 0         0 my %all_training_data = %{$self->{_training_data_hash}};
  0         0  
2000 0         0 my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
2001             Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
2002 0         0 my $fold_size = int(0.1 * (scalar keys %all_training_data));
2003 0         0 print "fold size: $fold_size\n";
2004 0         0 my %confusion_matrix = ();
2005 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2006 0         0 foreach my $inner_class_name (@{$self->{_class_names}}) {
  0         0  
2007 0         0 $confusion_matrix{$class_name}->{$inner_class_name} = 0;
2008             }
2009             }
2010 0         0 foreach my $fold_index (0..9) {
2011 0         0 print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2012 0         0 my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2013 0         0 my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2014             @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2015 0         0 my %testing_data = ();
2016 0         0 foreach my $x (@testing_samples) {
2017 0         0 $testing_data{$x} = $all_training_data{$x};
2018             }
2019 0         0 my %training_data = ();
2020 0         0 foreach my $x (@training_samples) {
2021 0         0 $training_data{$x} = $all_training_data{$x};
2022             }
2023 0         0 my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2024 0         0 $trainingDT->{_training_data_hash} = \%training_data;
2025 0         0 $trainingDT->{_class_names} = $self->{_class_names};
2026 0         0 $trainingDT->{_feature_names} = $self->{_feature_names};
2027 0         0 $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2028 0         0 $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2029             $trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2030 0         0 $self->{_symbolic_to_numeric_cardinality_threshold};
2031 0         0 foreach my $sample_name (@training_samples) {
2032             $trainingDT->{_samples_class_label_hash}->{$sample_name} =
2033 0         0 $self->{_samples_class_label_hash}->{$sample_name};
2034             }
2035 0         0 foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
  0         0  
2036 0         0 $trainingDT->{_features_and_values_hash}->{$feature} = ();
2037             }
2038 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
2039 0         0 foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
2040             Algorithm::DecisionTree::sample_index($b)}
2041 0         0 keys %{$trainingDT->{_training_data_hash}}) {
2042 0         0 foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
  0         0  
2043 0         0 $feature_and_value =~ /$pattern/;
2044 0         0 my ($feature,$value) = ($1,$2);
2045 0 0       0 push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
  0         0  
2046             }
2047             }
2048 0         0 foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2049 0         0 my %seen = ();
2050 0 0 0     0 my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2051 0         0 @{$trainingDT->{_features_and_values_hash}->{$feature}};
  0         0  
2052 0 0       0 if (Algorithm::DecisionTree::contained_in($feature,
2053 0         0 keys %{$self->{_numeric_features_valuerange_hash}})) {
2054 0         0 @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
  0         0  
2055             } else {
2056 0         0 @unique_values_for_feature = sort @unique_values_for_feature;
2057             }
2058 0         0 $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2059             }
2060 0         0 foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
  0         0  
2061             my @minmaxvalues = Algorithm::DecisionTree::minmax(
2062 0         0 \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
  0         0  
2063 0         0 $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2064             }
2065 0 0       0 if ($evaldebug) {
2066 0         0 print "\n\nprinting samples in the testing set: @testing_samples\n";
2067 0         0 print "\n\nPrinting features and their values in the training set:\n";
2068 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2069 0         0 print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
  0         0  
2070             }
2071 0         0 print "\n\nPrinting unique values for features:\n";
2072 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
  0         0  
2073 0         0 print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
  0         0  
2074             }
2075 0         0 print "\n\nPrinting unique value ranges for features:\n";
2076 0         0 foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
  0         0  
2077 0         0 print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
  0         0  
2078             }
2079             }
2080 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
2081             $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2082 0         0 scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
  0         0  
2083             }
2084 0 0       0 $trainingDT->{_debug2} = 1 if $evaldebug;
2085 0         0 $trainingDT->calculate_first_order_probabilities();
2086 0         0 $trainingDT->calculate_class_priors();
2087 0         0 my $root_node = $trainingDT->construct_decision_tree_classifier();
2088 0 0       0 $root_node->display_decision_tree(" ") if $evaldebug;
2089 0         0 foreach my $test_sample_name (@testing_samples) {
2090 0         0 my @test_sample_data = @{$all_training_data{$test_sample_name}};
  0         0  
2091 0 0       0 print "original data in test sample: @test_sample_data\n" if $evaldebug;
2092 0 0 0     0 @test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
  0         0  
2093 0 0       0 print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2094 0         0 my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
  0         0  
2095 0         0 my @solution_path = @{$classification{'solution_path'}};
  0         0  
2096 0         0 delete $classification{'solution_path'};
2097 0         0 my @which_classes = keys %classification;
2098 0         0 @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
  0         0  
2099 0         0 my $most_likely_class_label = $which_classes[0];
2100 0 0       0 if ($evaldebug) {
2101 0         0 print "\nClassification:\n\n";
2102 0         0 print " class probability\n";
2103 0         0 print " ---------- -----------\n";
2104 0         0 foreach my $which_class (@which_classes) {
2105 0         0 my $classstring = sprintf("%-30s", $which_class);
2106 0         0 my $valuestring = sprintf("%-30s", $classification{$which_class});
2107 0         0 print " $classstring $valuestring\n";
2108             }
2109 0         0 print "\nSolution path in the decision tree: @solution_path\n";
2110 0         0 print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2111             }
2112 0         0 my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2113 0 0       0 print "$test_sample_name: true_class: $true_class_label_for_sample " .
2114             "estimated_class: $most_likely_class_label\n" if $evaldebug;
2115 0         0 $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2116             }
2117             }
2118 0         0 print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2119 0         0 my $matrix_header = " " x 30;
2120 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2121 0         0 $matrix_header .= sprintf("%-30s", $class_name);
2122             }
2123 0         0 print "\n" . $matrix_header . "\n\n";
2124 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2125 0         0 my $row_display = sprintf("%-30s", $row_class_name);
2126 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2127 0         0 $row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2128             }
2129 0         0 print "$row_display\n\n";
2130             }
2131 0         0 print "\n\n";
2132 0         0 my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2133 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2134 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2135 0 0       0 if ($row_class_name eq $col_class_name) {
2136 0         0 $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2137             } else {
2138 0         0 $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2139             }
2140             }
2141             }
2142 0         0 my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2143 0         0 print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2144 0 0 0     0 if ($data_quality_index <= 80) {
    0 0        
    0 0        
    0          
2145 0         0 print "\nYour training data does not possess much class discriminatory " .
2146             "information. It could be that the classes are inherently not well " .
2147             "separable or that your constructor parameter choices are not appropriate.\n";
2148             } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2149 0         0 print "\nYour training data possesses some class discriminatory information " .
2150             "but it may not be sufficient for real-world applications. You might " .
2151             "try tweaking the constructor parameters to see if that improves the " .
2152             "class discriminations.\n";
2153             } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2154 0         0 print "\nYour training data appears to possess good class discriminatory " .
2155             "information. Whether or not it is acceptable would depend on your " .
2156             "application.\n";
2157             } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2158 0         0 print "\nYour training data is of excellent quality.\n";
2159             } else {
2160 0         0 print "\nYour training data is perfect.\n";
2161             }
2162              
2163             }
2164              
2165              
2166             ############################################# Class DTNode #############################################
2167              
2168             # The nodes of the decision tree are instances of this class:
2169              
2170             package DTNode;
2171              
2172 1     1   11 use strict;
  1         1  
  1         19  
2173 1     1   3 use Carp;
  1         1  
  1         806  
2174              
2175             # $feature is the feature test at the current node. $branch_features_and_values is
2176             # an anonymous array holding the feature names and corresponding values on the path
2177             # from the root to the current node:
2178             sub new {
2179 30     30   35 my ($class, $feature, $entropy, $class_probabilities,
2180             $branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2181 30 100       42 $root_or_not = '' if !defined $root_or_not;
2182 30 100       37 if ($root_or_not eq 'root') {
2183 1         2 $dt->{nodes_created} = -1;
2184 1         2 $dt->{class_names} = undef;
2185             }
2186 30         96 my $self = {
2187             _dt => $dt,
2188             _feature => $feature,
2189             _node_creation_entropy => $entropy,
2190             _class_probabilities => $class_probabilities,
2191             _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2192             _linked_to => [],
2193             };
2194 30         33 bless $self, $class;
2195 30         42 $self->{_serial_number} = $self->get_next_serial_num();
2196 30         30 return $self;
2197             }
2198              
2199             sub how_many_nodes {
2200 0     0   0 my $self = shift;
2201 0         0 return $self->{_dt}->{nodes_created} + 1;
2202             }
2203              
2204             sub set_class_names {
2205 1     1   1 my $self = shift;
2206 1         1 my $class_names_list = shift;
2207 1         2 $self->{_dt}->{class_names} = $class_names_list;
2208             }
2209              
2210             sub get_class_names {
2211 0     0   0 my $self = shift;
2212 0         0 return $self->{_dt}->{class_names};
2213             }
2214              
2215             sub get_next_serial_num {
2216 30     30   16 my $self = shift;
2217 30         33 $self->{_dt}->{nodes_created} += 1;
2218 30         42 return $self->{_dt}->{nodes_created};
2219             }
2220              
2221             sub get_serial_num {
2222 30     30   27 my $self = shift;
2223 30         29 $self->{_serial_number};
2224             }
2225              
2226             # this returns the feature test at the current node
2227             sub get_feature {
2228 0     0   0 my $self = shift;
2229 0         0 return $self->{ _feature };
2230             }
2231              
2232             sub set_feature {
2233 29     29   25 my $self = shift;
2234 29         19 my $feature = shift;
2235 29         36 $self->{_feature} = $feature;
2236             }
2237              
2238             sub get_node_entropy {
2239 30     30   21 my $self = shift;
2240 30         29 return $self->{_node_creation_entropy};
2241             }
2242              
2243             sub get_class_probabilities {
2244 0     0   0 my $self = shift;
2245 0         0 return $self->{ _class_probabilities};
2246             }
2247              
2248             sub get_branch_features_and_values_or_thresholds {
2249 30     30   19 my $self = shift;
2250 30         39 return $self->{_branch_features_and_values_or_thresholds};
2251             }
2252              
2253             sub add_to_branch_features_and_values {
2254 0     0   0 my $self = shift;
2255 0         0 my $feature_and_value = shift;
2256 0         0 push @{$self->{ _branch_features_and_values }}, $feature_and_value;
  0         0  
2257             }
2258              
2259             sub get_children {
2260 0     0   0 my $self = shift;
2261 0         0 return $self->{_linked_to};
2262             }
2263              
2264             sub add_child_link {
2265 29     29   20 my ($self, $new_node, ) = @_;
2266 29         17 push @{$self->{_linked_to}}, $new_node;
  29         40  
2267             }
2268              
2269             sub delete_all_links {
2270 0     0   0 my $self = shift;
2271 0         0 $self->{_linked_to} = undef;
2272             }
2273              
2274             sub display_node {
2275 0     0   0 my $self = shift;
2276 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2277 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2278 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2279 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2280 0         0 my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2281 0         0 my $serial_num = $self->get_serial_num();
2282 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2283 0         0 print "\n\nNODE $serial_num" .
2284             ":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2285             "\n Class probabilities at current node: @class_probabilities_for_display" .
2286             "\n Entropy at current node: $print_node_creation_entropy_at_node" .
2287             "\n Best feature test at current node: $feature_at_node\n\n";
2288             }
2289              
2290             sub display_decision_tree {
2291 0     0   0 my $self = shift;
2292 0         0 my $offset = shift;
2293 0         0 my $serial_num = $self->get_serial_num();
2294 0 0       0 if (@{$self->get_children()} > 0) {
  0         0  
2295 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2296 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2297 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2298 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2299 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2300 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2301 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2302             my @print_class_probabilities_with_class =
2303 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2304 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2305 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2306 0         0 print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2307             "$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2308 0         0 $offset .= " ";
2309 0         0 foreach my $child (@{$self->get_children()}) {
  0         0  
2310 0         0 $child->display_decision_tree($offset);
2311             }
2312             } else {
2313 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2314 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2315 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2316 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2317 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2318 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2319             my @print_class_probabilities_with_class =
2320 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2321 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2322 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2323 0         0 print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2324             "Class Probs: @print_class_probabilities_with_class\n\n";
2325             }
2326             }
2327              
2328              
2329             ############################## Generate Your Own Numeric Training Data #################################
2330             ############################# Class TrainingDataGeneratorNumeric ################################
2331              
2332             ## See the script generate_training_data_numeric.pl in the examples
2333             ## directory for how to use this class for generating your own numeric training and
2334             ## test data. The training and test data are generated in accordance with the
2335             ## specifications you place in the parameter file that is supplied as an argument to
2336             ## the constructor of this class.
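## A minimal usage sketch; the file names and the sample counts shown here are
## illustrative placeholders:
##
#    my $data_gen = TrainingDataGeneratorNumeric->new(
#                       output_training_csv_file       => 'training.csv',
#                       output_test_csv_file           => 'testing.csv',
#                       parameter_file                 => 'param_numeric.txt',
#                       number_of_samples_for_training => 3000,
#                       number_of_samples_for_testing  => 1000,
#                   );
#    $data_gen->read_parameter_file_numeric();
#    $data_gen->gen_numeric_training_and_test_data_and_write_to_csv();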
2337              
2338             package TrainingDataGeneratorNumeric;
2339              
2340 1     1   4 use strict;
  1         2  
  1         15  
2341 1     1   3 use Carp;
  1         1  
  1         744  
2342              
2343             sub new {
2344 0     0   0 my ($class, %args) = @_;
2345 0         0 my @params = keys %args;
2346 0 0       0 croak "\nYou have used a wrong name for a keyword argument " .
2347             "--- perhaps a misspelling\n"
2348             if check_for_illegal_params3(@params) == 0;
2349             bless {
2350             _output_training_csv_file => $args{'output_training_csv_file'}
2351             || croak("name for output_training_csv_file required"),
2352             _output_test_csv_file => $args{'output_test_csv_file'}
2353             || croak("name for output_test_csv_file required"),
2354             _parameter_file => $args{'parameter_file'}
2355             || croak("parameter_file required"),
2356             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2357             || croak("number_of_samples_for_training required"),
2358             _number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2359             || croak("number_of_samples_for_testing required"),
2360 0   0     0 _debug => $args{debug} || 0,
      0        
      0        
      0        
      0        
      0        
2361             _class_names => [],
2362             _class_names_and_priors => {},
2363             _features_with_value_range => {},
2364             _features_ordered => [],
2365             _classes_and_their_param_values => {},
2366             }, $class;
2367             }
2368              
2369             sub check_for_illegal_params3 {
2370 0     0   0 my @params = @_;
2371 0         0 my @legal_params = qw / output_training_csv_file
2372             output_test_csv_file
2373             parameter_file
2374             number_of_samples_for_training
2375             number_of_samples_for_testing
2376             debug
2377             /;
2378 0         0 my $found_match_flag;
2379 0         0 foreach my $param (@params) {
2380 0         0 foreach my $legal (@legal_params) {
2381 0         0 $found_match_flag = 0;
2382 0 0       0 if ($param eq $legal) {
2383 0         0 $found_match_flag = 1;
2384 0         0 last;
2385             }
2386             }
2387 0 0       0 last if $found_match_flag == 0;
2388             }
2389 0         0 return $found_match_flag;
2390             }
2391              
2392             ## The training data generated by an instance of the class
2393             ## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file
2394             ## that you supply to the class constructor through a constructor variable called
2395             ## `parameter_file'. This method parses the parameter file in order to determine
2396             ## the names to be used for the different data classes, their means, and their
2397             ## variances.
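## Judging from the regexes used below, a parameter file for two classes and
## two numeric features could look something like the following (all names and
## numbers are purely illustrative):
##
#    class names: recession goodtimes
#    class priors: 0.4 0.6
#    feature name: gdp
#    value range: 0 100
#    feature name: return_on_invest
#    value range: 0 100
#    params for class: recession
#    mean: 50.0 30.0
#    covariance:
#    1.0 0.0
#    0.0 1.0
#    params for class: goodtimes
#    mean: 50.0 60.0
#    covariance:
#    1.0 0.0
#    0.0 1.0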
2398             sub read_parameter_file_numeric {
2399 0     0   0 my $self = shift;
2400 0         0 my @class_names = ();
2401 0         0 my %class_names_and_priors = ();
2402 0         0 my %features_with_value_range = ();
2403 0         0 my %classes_and_their_param_values = ();
2404             # my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2405 0   0     0 open(FILE, $self->{_parameter_file}) or die "unable to open parameter file: $!";
2406 0         0 my @params = <FILE>;
2407 0         0 my $params = join "", @params;
2408 0         0 my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2409 0         0 $params =~ /$regex/si;
2410 0         0 my ($class_names, $class_priors) = ($1, $2);
2411 0         0 @class_names = split ' ', $class_names;
2412 0         0 my @class_priors = split ' ', $class_priors;
2413 0         0 foreach my $i (0..@class_names-1) {
2414 0         0 $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2415             }
2416 0 0       0 if ($self->{_debug}) {
2417 0         0 foreach my $cname (keys %class_names_and_priors) {
2418 0         0 print "$cname => $class_names_and_priors{$cname}\n";
2419             }
2420             }
2421 0         0 $regex = 'feature name: \w*.*?value range: [\d\. -]+';
2422 0         0 my @features = $params =~ /$regex/gsi;
2423 0         0 my @features_ordered;
2424 0         0 $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2425 0         0 foreach my $feature (@features) {
2426 0         0 $feature =~ /$regex/i;
2427 0         0 my $feature_name = $1;
2428 0         0 push @features_ordered, $feature_name;
2429 0         0 my @value_range = split ' ', $2;
2430 0         0 $features_with_value_range{$feature_name} = \@value_range;
2431             }
2432 0 0       0 if ($self->{_debug}) {
2433 0         0 foreach my $fname (keys %features_with_value_range) {
2434 0         0 print "$fname => @{$features_with_value_range{$fname}}\n";
  0         0  
2435             }
2436             }
2437 0         0 foreach my $i (0..@class_names-1) {
2438 0         0 $classes_and_their_param_values{$class_names[$i]} = {};
2439             }
2440 0         0 $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2441 0         0 my @class_params = $params =~ /$regex/gsi;
2442 0         0 $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2443 0         0 foreach my $class_param (@class_params) {
2444 0         0 $class_param =~ /$regex/gsi;
2445 0         0 my $class_name = $1;
2446 0         0 my @class_mean = split ' ', $2;
2447 0         0 $classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2448 0         0 my $class_param_string = $3;
2449 0         0 my @covar_rows = split '\n', $class_param_string;
2450 0         0 my @covar_matrix;
2451 0         0 foreach my $row (@covar_rows) {
2452 0         0 my @row = split ' ', $row;
2453 0         0 push @covar_matrix, \@row;
2454             }
2455 0         0 $classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2456             }
2457 0 0       0 if ($self->{_debug}) {
2458 0         0 print "\nThe class parameters are:\n\n";
2459 0         0 foreach my $cname (keys %classes_and_their_param_values) {
2460 0         0 print "\nFor class name $cname:\n";
2461 0         0 my %params_hash = %{$classes_and_their_param_values{$cname}};
  0         0  
2462 0         0 foreach my $x (keys %params_hash) {
2463 0 0       0 if ($x eq 'mean') {
2464 0         0 print " $x => @{$params_hash{$x}}\n";
  0         0  
2465             } else {
2466 0 0       0 if ($x eq 'covariance') {
2467 0         0 print " The covariance matrix:\n";
2468 0         0 my @matrix = @{$params_hash{'covariance'}};
  0         0  
2469 0         0 foreach my $row (@matrix) {
2470 0         0 print " @$row\n";
2471             }
2472             }
2473             }
2474             }
2475             }
2476             }
2477 0         0 $self->{_class_names} = \@class_names;
2478 0         0 $self->{_class_names_and_priors} = \%class_names_and_priors;
2479 0         0 $self->{_features_with_value_range} = \%features_with_value_range;
2480 0         0 $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2481 0         0 $self->{_features_ordered} = \@features_ordered;
2482             }
2483              
2484             ## After the parameter file is parsed by the previous method, this method calls on
2485             ## Math::Random::random_multivariate_normal() to generate the training and test data
2486             ## samples. Your training and test data can be of any number of dimensions, can
2487             ## have any mean, and any covariance. The training and test data must obviously be
2488             ## drawn from the same distribution.
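## As an illustration of the call shape used below, 100 two-dimensional samples
## with zero mean and identity covariance could be drawn with (illustrative
## values only):
##
#    my @samples = Math::Random::random_multivariate_normal(
#                      100, (0.0, 0.0), ([1.0, 0.0], [0.0, 1.0]) );
#    # each element of @samples is a reference to a two-element array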
2489             sub gen_numeric_training_and_test_data_and_write_to_csv {
2490 1     1   579 use Math::Random;
  1         4123  
  1         654  
2491 0     0   0 my $self = shift;
2492 0         0 my %training_samples_for_class;
2493             my %test_samples_for_class;
2494 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2495 0         0 $training_samples_for_class{$class_name} = [];
2496 0         0 $test_samples_for_class{$class_name} = [];
2497             }
2498 0         0 foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
  0         0  
2499 0         0 my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
  0         0  
2500 0         0 my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
  0         0  
2501             my @new_training_data = Math::Random::random_multivariate_normal(
2502 0         0 $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2503             @mean, @covariance );
2504             my @new_test_data = Math::Random::random_multivariate_normal(
2505 0         0 $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2506             @mean, @covariance );
2507 0 0       0 if ($self->{_debug}) {
2508 0         0 print "training data for class $class_name:\n";
2509 0         0 foreach my $x (@new_training_data) {print "@$x\n";}
  0         0  
2510 0         0 print "\n\ntest data for class $class_name:\n";
2511 0         0 foreach my $x (@new_test_data) {print "@$x\n";}
  0         0  
2512             }
2513 0         0 $training_samples_for_class{$class_name} = \@new_training_data;
2514 0         0 $test_samples_for_class{$class_name} = \@new_test_data;
2515             }
2516 0         0 my @training_data_records = ();
2517 0         0 my @test_data_records = ();
2518 0         0 foreach my $class_name (keys %training_samples_for_class) {
2519             my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2520 0         0 $self->{_class_names_and_priors}->{$class_name};
2521             my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2522 0         0 $self->{_class_names_and_priors}->{$class_name};
2523 0         0 foreach my $sample_index (0..$num_of_samples_for_training-1) {
2524 0         0 my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2525 0         0 @training_vector = map {sprintf("%.3f", $_)} @training_vector;
  0         0  
2526 0         0 my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2527 0         0 push @training_data_records, $training_data_record;
2528             }
2529 0         0 foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2530 0         0 my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2531 0         0 @test_vector = map {sprintf("%.3f", $_)} @test_vector;
  0         0  
2532 0         0 my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2533 0         0 push @test_data_records, $test_data_record;
2534             }
2535             }
2536 0         0 fisher_yates_shuffle(\@training_data_records);
2537 0         0 fisher_yates_shuffle(\@test_data_records);
2538 0 0       0 if ($self->{_debug}) {
2539 0         0 foreach my $record (@training_data_records) {
2540 0         0 print "$record";
2541             }
2542 0         0 foreach my $record (@test_data_records) {
2543 0         0 print "$record";
2544             }
2545             }
2546 0         0 open OUTPUT, ">$self->{_output_training_csv_file}";
2547 0         0 my @feature_names_training = @{$self->{_features_ordered}};
  0         0  
2548 0         0 my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
  0         0  
2549 0         0 my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2550 0         0 print OUTPUT "$first_row_training\n";
2551 0         0 foreach my $i (0..@training_data_records-1) {
2552 0         0 my $i1 = $i+1;
2553 0         0 my $sample_record = "\"$i1\",$training_data_records[$i]";
2554 0         0 print OUTPUT "$sample_record";
2555             }
2556 0         0 close OUTPUT;
2557 0         0 open OUTPUT, ">$self->{_output_test_csv_file}";
2558 0         0 my @feature_names_testing = keys %{$self->{_features_with_value_range}};
  0         0  
2559 0         0 my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
  0         0  
2560 0         0 my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2561 0         0 print OUTPUT "$first_row_testing\n";
2562 0         0 foreach my $i (0..@test_data_records-1) {
2563 0         0 my $i1 = $i+1;
2564 0         0 my $sample_record = "\"$i1\",$test_data_records[$i]";
2565 0         0 print OUTPUT "$sample_record";
2566             }
2567 0         0 close OUTPUT;
2568             }
2569              
2570             # from perl docs:
2571             sub fisher_yates_shuffle {
2572 0     0   0 my $arr = shift;
2573 0         0 my $i = @$arr;
2574 0         0 while (--$i) {
2575 0         0 my $j = int rand( $i + 1 );
2576 0         0 @$arr[$i, $j] = @$arr[$j, $i];
2577             }
2578             }
2579              
2580             ########################### Generate Your Own Symbolic Training Data ###############################
2581             ########################### Class TrainingDataGeneratorSymbolic #############################
2582              
2583             ## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2584             ## this class for generating purely symbolic training and test data. The data is
2585             ## generated according to the specifications you place in a parameter file whose
2586             ## name you supply as one of the constructor arguments.
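## A minimal usage sketch; the file names and the sample count shown here are
## illustrative placeholders:
##
#    my $data_gen = TrainingDataGeneratorSymbolic->new(
#                       output_training_datafile       => 'training_symbolic.csv',
#                       parameter_file                 => 'param_symbolic.txt',
#                       number_of_samples_for_training => 100,
#                   );
#    $data_gen->read_parameter_file_symbolic();
#    $data_gen->gen_symbolic_training_data();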
2587             package TrainingDataGeneratorSymbolic;
2588              
2589 1     1   8 use strict;
  1         2  
  1         16  
2590 1     1   3 use Carp;
  1         1  
  1         1641  
2591              
2592             sub new {
2593 1     1   12 my ($class, %args) = @_;
2594 1         3 my @params = keys %args;
2595 1 50       3 croak "\nYou have used a wrong name for a keyword argument " .
2596             "--- perhaps a misspelling\n"
2597             if check_for_illegal_params4(@params) == 0;
2598             bless {
2599             _output_training_datafile => $args{'output_training_datafile'}
2600             || die("name for output_training_datafile required"),
2601             _parameter_file => $args{'parameter_file'}
2602             || die("parameter_file required"),
2603             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2604             || die("number_of_samples_for_training required"),
2605 1   50     18 _debug => $args{debug} || 0,
      50        
      50        
      50        
2606             _class_names => [],
2607             _class_priors => [],
2608             _features_and_values_hash => {},
2609             _bias_hash => {},
2610             _training_sample_records => {},
2611             }, $class;
2612             }
2613              
2614             sub check_for_illegal_params4 {
2615 1     1   1 my @params = @_;
2616 1         3 my @legal_params = qw / output_training_datafile
2617             parameter_file
2618             number_of_samples_for_training
2619             debug
2620             /;
2621 1         1 my $found_match_flag;
2622 1         2 foreach my $param (@params) {
2623 3         3 foreach my $legal (@legal_params) {
2624 6         4 $found_match_flag = 0;
2625 6 100       9 if ($param eq $legal) {
2626 3         1 $found_match_flag = 1;
2627 3         3 last;
2628             }
2629             }
2630 3 50       5 last if $found_match_flag == 0;
2631             }
2632 1         3 return $found_match_flag;
2633             }
2634              
2635             ## Read a parameter file for generating symbolic training data. See the script
2636             ## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2637             ## pass the name of the parameter file to the constructor of the
2638             ## TrainingDataGeneratorSymbolic class.
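## Judging from the parsing logic below, a parameter file could look roughly
## like the following (the class, feature, and value names as well as the
## biases are purely illustrative):
##
#    class names: benign malignant
#    class priors: 0.6 0.4
#    feature: smoking
#    values: never light medium heavy
#    feature: exercising
#    values: never occasionally regularly
#    bias: class: benign
#    smoking: never=0.7
#    exercising: regularly=0.6
#    bias: class: malignant
#    smoking: heavy=0.7
#    exercising: never=0.6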
2639             sub read_parameter_file_symbolic {
2640 1     1   4 my $self = shift;
2641 1         4 my $debug = $self->{_debug};
2642 1         2 my $number_of_training_samples = $self->{_number_of_samples_for_training};
2643 1         1 my $input_parameter_file = $self->{_parameter_file};
2644 1 50       4 croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2645 1         1 my $output_file_training = $self->{_output_training_datafile};
2646 1         20 my $output_file_testing = $self->{_output_test_datafile};
2647 1         1 my @all_params;
2648             my $param_string;
2649 1   33     25 open(INPUT, $input_parameter_file) or die "unable to open parameter file: $!";
2650 1         28 @all_params = <INPUT>;
2651 1         3 @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
  40         46  
2652 1         2 @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
  36         58  
2653 1         9 $param_string = join ' ', @all_params;
2654 1         7 my ($class_names, $class_priors, $rest_param) =
2655             $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2656 1 50       21 my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
  3         11  
2657 1         2 push @{$self->{_class_names}}, @class_names;
  1         2  
2658 1 50       4 my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
  3         10  
2659 1         1 push @{$self->{_class_priors}}, @class_priors;
  1         2  
2660 1         7 my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2661 1         1 my %features_and_values_hash;
2662 1         9 my @features = split /(feature[:])/, $feature_string;
2663 1 50       2 @features = grep {defined($_) && length($_) > 0} @features;
  9         22  
2664 1         2 foreach my $item (@features) {
2665 8 100       14 next if $item =~ /feature/;
2666 4         11 my @splits = split / /, $item;
2667 4 50       5 @splits = grep {defined($_) && length($_) > 0} @splits;
  27         59  
2668 4         7 foreach my $i (0..@splits-1) {
2669 22 100       23 if ($i == 0) {
2670 4         7 $features_and_values_hash{$splits[0]} = [];
2671             } else {
2672 18 100       26 next if $splits[$i] =~ /values/;
2673 14         9 push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
  14         18  
2674             }
2675             }
2676             }
2677 1         2 $self->{_features_and_values_hash} = \%features_and_values_hash;
2678 1         2 my %bias_hash = %{$self->{_bias_hash}};
  1         2  
2679 1         6 my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2680 1 50       1 @biases = grep {defined($_) && length($_) > 0} @biases;
  5         14  
2681 1         1 foreach my $item (@biases) {
2682 4 100       11 next if $item =~ /bias/;
2683 2         15 my @splits = split /\s+/, $item;
2684 2 50       2 @splits = grep {defined($_) && length($_) > 0} @splits;
  18         40  
2685 2         3 my $feature_name;
2686 2         3 foreach my $i (0..@splits-1) {
2687 16 100       29 if ($i == 0) {
    100          
2688 2         3 $bias_hash{$splits[0]} = {};
2689             } elsif ($splits[$i] =~ /(^.+)[:]$/) {
2690 8         5 $feature_name = $1;
2691 8         15 $bias_hash{$splits[0]}->{$feature_name} = [];
2692             } else {
2693 6 50       8 next if !defined $feature_name;
2694 6 50       8 push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
  6         11  
2695             if defined $feature_name;
2696             }
2697             }
2698             }
2699 1         1 $self->{_bias_hash} = \%bias_hash;
2700 1 50       10 if ($debug) {
2701 0         0 print "\n\nClass names: @class_names\n";
2702 0         0 my $num_of_classes = @class_names;
2703 0         0 print "Class priors: @class_priors\n";
2704 0         0 print "Number of classes: $num_of_classes\n";
2705 0         0 print "\nHere are the features and their possible values:\n";
2706 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
2707 0         0 print "$k ===> @$v\n";
2708             }
2709 0         0 print "\nHere is the biasing for each class:\n";
2710 0         0 while ( my ($k, $v) = each %bias_hash ) {
2711 0         0 print "$k:\n";
2712 0         0 while ( my ($k1, $v1) = each %$v ) {
2713 0         0 print " $k1 ===> @$v1\n";
2714             }
2715             }
2716             }
2717             }
2718              
2719             ## This method generates training data according to the specifications placed in a
2720             ## parameter file that is read by the previous method.
2721             sub gen_symbolic_training_data {
2722 1     1   4 my $self = shift;
2723 1         2 my @class_names = @{$self->{_class_names}};
  1         2  
2724 1         2 my @class_priors = @{$self->{_class_priors}};
  1         2  
2725 1         1 my %training_sample_records;
2726 1         0 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  1         4  
2727 1         2 my %bias_hash = %{$self->{_bias_hash}};
  1         1  
2728 1         2 my $how_many_training_samples = $self->{_number_of_samples_for_training};
2729 1         1 my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2730 1         1 my %class_priors_to_unit_interval_map;
2731 1         1 my $accumulated_interval = 0;
2732 1         2 foreach my $i (0..@class_names-1) {
2733 2         4 $class_priors_to_unit_interval_map{$class_names[$i]}
2734             = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2735 2         3 $accumulated_interval += $class_priors[$i];
2736             }
2737 1 50       3 if ($self->{_debug}) {
2738 0         0 print "Mapping of class priors to unit interval: \n";
2739 0         0 while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2740 0         0 print "$k => @$v\n";
2741             }
2742 0         0 print "\n\n";
2743             }
2744 1         1 my $ele_index = 0;
2745 1         2 while ($ele_index < $how_many_training_samples) {
2746 35         33 my $sample_name = "sample" . "_$ele_index";
2747 35         50 $training_sample_records{$sample_name} = [];
2748             # Generate class label for this training sample:
2749 35         47 my $roll_the_dice = rand(1.0);
2750 35         22 my $class_label;
2751 35         38 foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2752 49         32 my $v = $class_priors_to_unit_interval_map{$class_name};
2753 49 100 66     123 if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2754 35         25 push @{$training_sample_records{$sample_name}},
  35         51  
2755             "class=" . $class_name;
2756 35         23 $class_label = $class_name;
2757 35         26 last;
2758             }
2759             }
2760 35         47 foreach my $feature (keys %features_and_values_hash) {
2761 140         90 my @values = @{$features_and_values_hash{$feature}};
  140         190  
2762 140         125 my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2763 140         111 my $no_bias = 1.0 / @values;
2764 140 100       252 $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2765 140         84 my %value_priors_to_unit_interval_map;
2766 140         325 my @splits = split /\s*=\s*/, $bias_string;
2767 140         103 my $chosen_for_bias_value = $splits[0];
2768 140         77 my $chosen_bias = $splits[1];
2769 140         135 my $remaining_bias = 1 - $chosen_bias;
2770 140         115 my $remaining_portion_bias = $remaining_bias / (@values -1);
2771 140 50       112 @splits = grep {defined($_) && length($_) > 0} @splits;
  280         745  
2772 140         95 my $accumulated = 0;
2773 140         152 foreach my $i (0..@values-1) {
2774 490 100       461 if ($values[$i] eq $chosen_for_bias_value) {
2775 140         188 $value_priors_to_unit_interval_map{$values[$i]}
2776             = [$accumulated, $accumulated + $chosen_bias];
2777 140         132 $accumulated += $chosen_bias;
2778             } else {
2779 350         415 $value_priors_to_unit_interval_map{$values[$i]}
2780             = [$accumulated, $accumulated + $remaining_portion_bias];
2781 350         269 $accumulated += $remaining_portion_bias;
2782             }
2783             }
2784 140         107 my $roll_the_dice = rand(1.0);
2785 140         82 my $value_label;
2786 140         160 foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2787 299         185 my $v = $value_priors_to_unit_interval_map{$value_name};
2788 299 100 100     702 if ( ($roll_the_dice >= $v->[0])
2789             && ($roll_the_dice <= $v->[1]) ) {
2790 140         75 push @{$training_sample_records{$sample_name}},
  140         244  
2791             $feature . "=" . $value_name;
2792 140         99 $value_label = $value_name;
2793 140         95 last;
2794             }
2795             }
2796 140 50       373 if ($self->{_debug}) {
2797 0         0 print "mapping feature value priors for '$feature' " .
2798             "to unit interval: \n";
2799 0         0 while ( my ($k, $v) =
2800             each %value_priors_to_unit_interval_map ) {
2801 0         0 print "$k => @$v\n";
2802             }
2803 0         0 print "\n\n";
2804             }
2805             }
2806 35         55 $ele_index++;
2807             }
2808 1         1 $self->{_training_sample_records} = \%training_sample_records;
2809 1 50       3 if ($self->{_debug}) {
2810 0         0 print "\n\nPRINTING TRAINING RECORDS:\n\n";
2811 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
  0         0  
2812 0         0 print "$kee => @{$training_sample_records{$kee}}\n\n";
  0         0  
2813             }
2814             }
2815 1         2 my $output_training_file = $self->{_output_training_datafile};
2816 1 50       2 print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2817 1         82 open FILEHANDLE, ">$output_training_file";
2818 1         7 my @features = sort keys %features_and_values_hash;
2819 1         2 my $title_string = ',class';
2820 1         1 foreach my $feature_name (@features) {
2821 4         6 $title_string .= ',' . $feature_name;
2822             }
2823 1         10 print FILEHANDLE "$title_string\n";
2824 1         11 my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
  105         72  
  35         49  
  35         45  
2825 1         4 my $record_string = '';
2826 1         2 foreach my $sample_name (@sample_names) {
2827 35         40 $record_string .= "$sample_name,";
2828 35         23 my @record = @{$training_sample_records{"sample_$sample_name"}};
  35         50  
2829 35         46 my %item_parts_hash;
2830 35         26 foreach my $item (@record) {
2831 175         228 my @splits = grep $_, split /=/, $item;
2832 175         191 $item_parts_hash{$splits[0]} = $splits[1];
2833             }
2834 35         27 $record_string .= $item_parts_hash{"class"};
2835 35         26 delete $item_parts_hash{"class"};
2836 35         61 my @kees = sort keys %item_parts_hash;
2837 35         33 foreach my $kee (@kees) {
2838 140         108 $record_string .= ",$item_parts_hash{$kee}";
2839             }
2840 35         33 print FILEHANDLE "$record_string\n";
2841 35         54 $record_string = '';
2842             }
2843 1         50 close FILEHANDLE;
2844             }
2845              
2846             sub sample_index {
2847 0     0     my $arg = shift;
2848 0           $arg =~ /_(.+)$/;
2849 0           return $1;
2850             }
2851              
2852             ################################# Decision Tree Introspection #######################################
2853             ################################# Class DTIntrospection #######################################
2854              
2855             package DTIntrospection;
2856              
2857             ## Instances constructed from this class can provide explanations for the
2858             ## classification decisions at the nodes of a decision tree.
2859             ##
2860             ## When used in the interactive mode, the decision-tree introspection made possible
2861             ## by this class provides answers to the following three questions: (1) List of the
2862             ## training samples that fall in the portion of the feature space that corresponds
2863             ## to a node of the decision tree; (2) The probabilities associated with the last
2864             ## feature test that led to the node; and (3) The class probabilities predicated on
2865             ## just the last feature test on the path to that node.
2866             ##
2867             ## CAVEAT: It is possible for a node to exist even when there are no training
2868             ## samples in the portion of the feature space that corresponds to the node. That
2869             ## is because a decision tree is based on the probability densities estimated from
2870             ## the training data. When training data is non-uniformly distributed, it is
2871             ## possible for the probability associated with a point in the feature space to be
2872             ## non-zero even when there are no training samples at or in the vicinity of that
2873             ## point.
2874             ##
2875             ## For a node to exist even where there are no training samples in the portion of
2876             ## the feature space that belongs to the node is an indication of the generalization
2877             ## ability of decision-tree based classification.
2878             ##
2879             ## When used in a non-interactive mode, an instance of this class can be used to
2880             ## create a tabular display that shows what training samples belong directly to the
2881             ## portion of the feature space that corresponds to each node of the decision tree.
2882             ## An instance of this class can also construct a tabular display that shows how the
2883             ## influence of each training sample propagates in the decision tree. For each
2884             ## training sample, this display first shows the list of nodes that came into
2885             ## existence through feature test(s) that used the data provided by that sample.
2886             ## This list for each training sample is followed by a subtree of the nodes that owe
2887             ## their existence indirectly to the training sample. A training sample influences a
2888             ## node indirectly if the node is a descendant of another node that is affected
2889             ## directly by the training sample.
2890              
2891 1     1   5 use strict;
  1         0  
  1         20  
2892 1     1   4 use Carp;
  1         0  
  1         1847  
2893              
2894             sub new {
2895 0     0     my ($class, $dt) = @_;
2896 0 0         croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2897             unless ref($dt) eq "Algorithm::DecisionTree";
2898             bless {
2899             _dt => $dt,
2900             _root_dtnode => $dt->{_root_node},
2901 0           _samples_at_nodes_hash => {},
2902             _branch_features_to_nodes_hash => {},
2903             _sample_to_node_mapping_direct_hash => {},
2904             _node_serial_num_to_node_hash => {},
2905             _awareness_raising_msg_shown => 0,
2906             _debug => 0,
2907             }, $class;
2908             }
2909              
2910             sub initialize {
2911 0     0     my $self = shift;
2912             croak "You must first construct the decision tree before using the DTIntrospection class"
2913 0 0         unless $self->{_root_dtnode};
2914 0           $self->recursive_descent($self->{_root_dtnode});
2915             }
2916              
2917             sub recursive_descent {
2918 0     0     my $self = shift;
2919 0           my $node = shift;
2920 0           my $node_serial_number = $node->get_serial_num();
2921 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2922 0 0         print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
  0            
2923 0           $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2924 0           $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2925 0           my @samples_at_node = ();
2926 0           foreach my $item (@$branch_features_and_values_or_thresholds) {
2927 0           my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2928 0 0         unless (@samples_at_node) {
2929 0           @samples_at_node = @$samples_for_feature_value_combo;
2930             } else {
2931 0           my @accum;
2932 0           foreach my $sample (@samples_at_node) {
2933 0 0         push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2934             }
2935 0           @samples_at_node = @accum;
2936             }
2937 0 0         last unless @samples_at_node;
2938             }
2939 0           @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
  0            
2940 0 0         print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2941 0           $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2942 0 0         if (@samples_at_node) {
2943 0           foreach my $sample (@samples_at_node) {
2944 0 0         if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2945 0           $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2946             } else {
2947 0           push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
  0            
2948             }
2949             }
2950             }
2951 0           my $children = $node->get_children();
2952 0           foreach my $child (@$children) {
2953 0           $self->recursive_descent($child);
2954             }
2955             }
2956              
2957             sub display_training_samples_at_all_nodes_direct_influence_only {
2958 0     0     my $self = shift;
2959             croak "You must first construct the decision tree before using the DT Introspection class."
2960 0 0         unless $self->{_root_dtnode};
2961 0           $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2962             }
2963              
2964             sub recursive_descent_for_showing_samples_at_a_node{
2965 0     0     my $self = shift;
2966 0           my $node = shift;
2967 0           my $node_serial_number = $node->get_serial_num();
2968 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2969 0 0         if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2970 0 0         print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
  0            
2971 0           print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
  0            
2972             }
2973 0           map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
  0            
2974             }
2975              
2976             sub display_training_samples_to_nodes_influence_propagation {
2977 0     0     my $self = shift;
2978 0           foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
  0            
2979 0 0         if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2980 0           my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2981 0           print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
  0            
2982 0           print " nodes affected through probabilistic generalization:\n";
2983 0           map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2984             }
2985             }
2986             }
2987              
2988             sub recursive_descent_for_sample_to_node_influence {
2989 0     0     my $self = shift;
2990 0           my $node_serial_num = shift;
2991 0           my $nodes_already_accounted_for = shift;
2992 0           my $offset = shift;
2993 0           $offset .= " ";
2994 0           my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2995 0           my @children = map $_->get_serial_num(), @{$node->get_children()};
  0            
2996 0           my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
  0            
  0            
2997 0 0         if (@children_affected) {
2998 0           print "$offset $node_serial_num => [@children_affected]\n";
2999             }
3000 0           map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
3001             }
3002              
3003             sub get_samples_for_feature_value_combo {
3004 0     0     my $self = shift;
3005 0           my $feature_value_combo = shift;
3006 0           my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3007 0           my @samples = ();
3008 0 0         if ($op eq '=') {
    0          
    0          
3009 0           @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
  0            
  0            
3010             } elsif ($op eq '<') {
3011 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3012 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3013 0           foreach my $item (@features_and_values) {
3014 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3015 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
      0        
3016 0           push @samples, $sample;
3017 0           last;
3018             }
3019             }
3020             }
3021             } elsif ($op eq '>') {
3022 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3023 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3024 0           foreach my $item (@features_and_values) {
3025 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3026 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
      0        
3027 0           push @samples, $sample;
3028 0           last;
3029             }
3030             }
3031             }
3032             } else {
3033 0           die "Something strange is going on";
3034             }
3035 0           return \@samples;
3036             }
3037              
3038             sub extract_feature_op_val {
3039 0     0     my $self = shift;
3040 0           my $feature_value_combo = shift;
3041 0           my $pattern1 = '(.+)=(.+)';
3042 0           my $pattern2 = '(.+)<(.+)';
3043 0           my $pattern3 = '(.+)>(.+)';
3044 0           my ($feature,$value,$op);
3045 0 0         if ($feature_value_combo =~ /$pattern2/) {
    0          
    0          
3046 0           ($feature,$op,$value) = ($1,'<',$2);
3047             } elsif ($feature_value_combo =~ /$pattern3/) {
3048 0           ($feature,$op,$value) = ($1,'>',$2);
3049             } elsif ($feature_value_combo =~ /$pattern1/) {
3050 0           ($feature,$op,$value) = ($1,'=',$2);
3051             }
3052 0           return ($feature,$op,$value);
3053             }
3054              
3055             sub explain_classifications_at_multiple_nodes_interactively {
3056 0     0     my $self = shift;
3057             croak "You called explain_classification_at_multiple_nodes_interactively() without " .
3058             "first initializing the DTIntrospection instance in your code. Aborting."
3059 0 0         unless $self->{_samples_at_nodes_hash};
3060 0           print "\n\nIn order for the decision tree to introspect\n\n";
3061 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3062             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3063             " with it even when there are NO training data points in that region?\n" .
3064             "\nEnter 'y' for yes or any other character for no: ";
3065 0           my $ans = <STDIN>;
3066 0           $ans =~ s/^\s*|\s*$//g;
3067 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3068 0           $self->{_awareness_raising_msg_shown} = 1;
3069 0           while (1) {
3070 0           my $node_id;
3071             my $ans;
3072 0           while (1) {
3073 0           print "\nEnter the integer ID of a node: ";
3074 0           $ans = <STDIN>;
3075 0           $ans =~ s/^\s*|\s*$//g;
3076 0 0         return if $ans =~ /^exit$/;
3077 0 0         last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
  0            
3078 0           print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3079             }
3080 0           $node_id = $ans;
3081 0           $self->explain_classification_at_one_node($node_id)
3082             }
3083             }
3084              
3085             sub explain_classification_at_one_node {
3086 0     0     my $self = shift;
3087 0           my $node_id = shift;
3088             croak "You called explain_classification_at_one_node() without first initializing " .
3089 0 0         "the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3090 0 0         unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3091 0           print "Node $node_id is not a node in the tree\n";
3092 0           return;
3093             }
3094 0 0         unless ($self->{_awareness_raising_msg_shown}) {
3095 0           print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3096 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3097             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3098             " with it even when there are NO training data points in that region?\n" .
3099             "\nEnter 'y' for yes or any other character for no: ";
3100 0           my $ans = <STDIN>;
3101 0           $ans =~ s/^\s*|\s*$//g;
3102 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3103             }
3104 0           my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
  0            
3105 0           my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
  0            
3106             # my @class_names = @{DTNode->get_class_names()};
3107 0           my @class_names = @{$self->get_class_names()};
  0            
3108 0           my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3109 0           my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3110 0 0         my $msg = @samples_at_node == 0
3111             ? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3112             : "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3113 0           $msg .= "\n Features tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3114 0           $msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3115 0           $msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3116 0           print $msg;
3117 0           my $ans = <STDIN>;
3118 0           $ans =~ s/^\s*|\s*$//g;
3119 0 0         if ($ans =~ /^ye?s?$/) {
3120 0           my $sequence = [$branch_features_to_node[-1]];
3121 0           my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3122 0           print "\n probability of @{$sequence} is: $prob\n";
  0            
3123             }
3124 0           $msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3125 0           $msg .= "\n Enter 'y' for yes and any other character for no: ";
3126 0           print $msg;
3127 0           $ans = <STDIN>;
3128 0           $ans =~ s/^\s*|\s*$//g;
3129 0 0         if ($ans =~ /^ye?s?$/) {
3130 0           my $sequence = [$branch_features_to_node[-1]];
3131 0           foreach my $cls (@class_names) {
3132 0           my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3133 0           print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
  0            
3134             }
3135             } else {
3136 0           print "goodbye\n";
3137             }
3138 0           print "\n Finished supplying information on Node $node_id\n\n";
3139             }
3140              
3141             1;
3142              
3143             =pod
3144              
3145             =head1 NAME
3146              
3147             Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3148             multidimensional data.
3149              
3150              
3151             =head1 SYNOPSIS
3152              
3153             # FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3154              
3155             # In general, your call for constructing an instance of the DecisionTree class
3156             # will look like:
3157              
3158             my $training_datafile = "stage3cancer.csv";
3159             my $dt = Algorithm::DecisionTree->new(
3160             training_datafile => $training_datafile,
3161             csv_class_column_index => 2,
3162             csv_columns_for_features => [3,4,5,6,7,8],
3163             entropy_threshold => 0.01,
3164             max_depth_desired => 8,
3165             symbolic_to_numeric_cardinality_threshold => 10,
3166             );
3167              
3168             # The constructor option `csv_class_column_index' informs the module as to which
3169             # column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3170             # BASED. The constructor option `csv_columns_for_features' specifies which columns
3171             # are to be used for feature values. The first row of the CSV file must specify
3172             # the names of the features. See examples of CSV files in the `Examples'
3173             # subdirectory.
3174              
3175             # The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3176             # the example shown above, if an ostensibly numeric feature takes on only 10 or
3177             # fewer different values in your training datafile, it will be treated like a
3178             # symbolic feature. The option `entropy_threshold' determines the granularity
3179             # with which the entropies are sampled for the purpose of calculating entropy gain
3180             # with a particular choice of decision threshold for a numeric feature or a feature
3181             # value for a symbolic feature.
3182              
3183             # After you have constructed an instance of the DecisionTree class as shown above,
3184             # you read in the training data file and initialize the probability cache by
3185             # calling:
3186              
3187             $dt->get_training_data();
3188             $dt->calculate_first_order_probabilities();
3189             $dt->calculate_class_priors();
3190              
3191             # Next you construct a decision tree for your training data by calling:
3192              
3193             $root_node = $dt->construct_decision_tree_classifier();
3194              
3195             # where $root_node is an instance of the DTNode class that is also defined in the
3196             # module file. Now you are ready to classify a new data record. Let's say that
3197             # your data record looks like:
3198              
3199             my @test_sample = qw / g2=4.2
3200             grade=2.3
3201             gleason=4
3202             eet=1.7
3203             age=55.0
3204             ploidy=diploid /;
3205              
3206             # You can classify it by calling:
3207              
3208             my $classification = $dt->classify($root_node, \@test_sample);
3209              
3210             # The call to `classify()' returns a reference to a hash whose keys are the class
3211             # names and the values are the associated classification probabilities. This hash also
3212             # includes another key-value pair for the solution path from the root node to the
3213             # leaf node at which the final classification was carried out.
3214              
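             # As an illustration (a sketch, not required usage), here is one way to print
             # the class probabilities returned by `classify()'. The solution path is
             # assumed below to be stored as an array reference under the hash key
             # `solution_path':

             my @solution_path = @{$classification->{"solution_path"}};
             delete $classification->{"solution_path"};
             foreach my $class (sort {$classification->{$b} <=> $classification->{$a}} keys %$classification) {
                 print "$class => $classification->{$class}\n";
             }
             print "solution path from root to leaf: @solution_path\n";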
3215              
3216             =head1 CHANGES
3217              
3218             B<Version 3.41:> All the changes made in this version relate to the construction of
3219             regression trees. I have fixed a couple of bugs in the calculation of the regression
3220             coefficients. Additionally, the C class now comes with a new
3221             constructor parameter named C. For most cases, you'd set this
3222             parameter to 0, which causes the regression coefficients to be estimated through
3223             linear least-squares minimization.
3224              
3225             B In addition to constructing decision trees, this version of the
3226             module also allows you to construct regression trees. The regression tree capability
3227             has been packed into a separate subclass, named C, of the main
3228             C class. The subdirectory C in the main
3229             installation directory illustrates how you can use this new functionality of the
3230             module.
3231              
3232             B This version incorporates four very significant upgrades/changes to
3233             the C<Algorithm::DecisionTree> module: B<(1)> The CSV cleanup is now the default. So you do not
3234             have to set any special parameters in the constructor calls to initiate CSV
3235             cleanup. B<(2)> In the form of a new Perl class named C<RandomizedTreesForBigData>,
3236             this module provides you with an easy-to-use programming interface for attempting
3237             needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3238             dominated by a single class. You need to set the constructor parameter
3239             C to invoke the logic that constructs multiple
3240             decision trees, each using the minority class samples along with samples drawn
3241             randomly from the majority class. The final classification is made through a
3242             majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3243             big-data problem --- in the sense that you have been given a training database with a
3244             very large number of training records --- the class C<RandomizedTreesForBigData> will
3245             also let you construct multiple decision trees by pulling training data randomly from
3246             your training database (without paying attention to the relative populations of the
3247             classes). The final classification decision for a test sample is based on a majority
3248             vote from all the decision trees thus constructed. See the C
3249             directory for how to use these new features of the module. And, finally, B<(4)>
3250             Support for the old-style '.dat' training files has been dropped in this version.
3251              
3252             B This version makes it easier to use a CSV training file that
3253             violates the assumption that a comma be used only to separate the different field
3254             values in a line record. Some large econometrics databases use double-quoted values
3255             for fields, and these values may contain commas (presumably for better readability).
3256             This version also allows you to specify the leftmost entry in the first CSV record
3257             that names all the fields. Previously, this entry was required to be an empty
3258             double-quoted string. I have also made some minor changes to the
3259             'C' method to make it more user friendly for large
3260             training files that may contain tens of thousands of records. When pulling training
3261             data from such files, this method prints out a dot on the terminal screen for every
3262             10000 records it has processed.
3263              
3264             B This version brings the boosting capability to the C<Algorithm::DecisionTree>
3265             module.
3266              
3267             B This version adds bagging to the C<Algorithm::DecisionTree> module. If your
3268             training dataset is large enough, you can ask the module to construct multiple
3269             decision trees using data bags extracted from your dataset. The module can show you
3270             the results returned by the individual decision trees and also the results obtained
3271             by taking a majority vote of the classification decisions made by the individual
3272             trees. You can specify any arbitrary extent of overlap between the data bags.
3273              
3274             B The introspection capability in this version packs more of a punch.
3275             For each training data sample, you can now figure out not only the decision-tree
3276             nodes that are affected directly by that sample, but also those nodes that are
3277             affected indirectly through the generalization achieved by the probabilistic modeling
3278             of the data. The 'examples' directory of this version includes additional scripts
3279             that illustrate these enhancements to the introspection capability. See the section
3280             "The Introspection API" for a declaration of the introspection related methods, old
3281             and new.
3282              
3283             B<Version 2.30:> In response to requests from several users, this version includes a new
3284             capability: You can now ask the module to introspect about the classification
3285             decisions returned by the decision tree. Toward that end, the module includes a new
3286             class named C<DTIntrospection>. Perhaps the most important bit of information you
3287             are likely to seek through DT introspection is the list of the training samples that
3288             fall directly in the portion of the feature space that is assigned to a node.
3289             B When training samples are non-uniformly distributed in the underlying
3290             feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3291             IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3292             part of the generalization achieved by probabilistic modeling of the training data.)>
3293             For additional information related to DT introspection, see the section titled
3294             "DECISION TREE INTROSPECTION" in this documentation page.
3295              
3296             B makes the logic of tree construction from the old-style '.dat' training
3297             files more consistent with how trees are constructed from the data in `.csv' files.
3298             The inconsistency in the past was with respect to the naming convention for the class
3299             labels associated with the different data records.
3300              
3301             B fixes a bug in the part of the module that some folks use for generating
3302             synthetic data for experimenting with decision tree construction and classification.
3303             In the class C that is a part of the module, there
3304             was a problem with the order in which the features were recorded from the
3305             user-supplied parameter file. The basic code for decision tree construction and
3306             classification remains unchanged.
3307              
3308             B further downshifts the required version of Perl for this module. This
3309             was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3310             in the module code needed to be changed for the module to work with the older version
3311             of Perl.
3312              
3313             B fixes the C restriction on the required Perl version. This
3314             version should work with Perl versions 5.14.0 and higher.
3315              
3316             B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3317             else remains the same.
3318              
3319             B<Version 2.22> should prove more robust when the probability distribution for the
3320             values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3321             observations can occur with significant probabilities. A new option in the
3322             DecisionTree constructor lets the user specify the precision with which the
3323             probability distributions are estimated for such features.
3324              
3325             B fixes a bug that was caused by the explicitly set zero values for
3326             numerical features being misconstrued as "false" in the conditional statements in
3327             some of the method definitions.
3328              
3329             B makes it easier to write code for classifying in one go all of your test
3330             data samples in a CSV file. The bulk classifications obtained can be written out to
3331             either a CSV file or to a regular text file. See the script
3332             C in the C directory for how to
3333             classify all of your test data records in a CSV file. This version also includes
3334             improved code for generating synthetic numeric/symbolic training and test data
3335             records for experimenting with the decision tree classifier.
3336              
3337             B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
3338             cross-validation test on the data. This test divides all of the training data into
3339             ten parts, with nine parts used for training a decision tree and one part used for
3340             testing its ability to classify correctly. This selection of nine parts for training
3341             and one part for testing is carried out in all of the ten different ways that are
3342             possible. This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C<entropy_threshold>,
C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.
3345              
3346             B Now you can use both numeric and
3347             symbolic features for constructing a decision tree. A feature is numeric if it can
3348             take any floating-point value over an interval.
3349              
3350             B<Version 1.71> fixes a bug in the code that was triggered by 0 being declared as one of
3351             the feature values in the training datafile. Version 1.71 also includes an additional
3352             safety feature that is useful for training datafiles that contain a very large number
3353             of features. The new version makes sure that the number of values you declare for
3354             each sample record matches the number of features declared at the beginning of the
3355             training datafile.
3356              
3357             B<Version 1.7> includes safety checks on the consistency of the data you place in your
3358             training datafile. When a training file contains thousands of samples, it is
3359             difficult to manually check that you used the same class names in your sample records
3360             that you declared at top of your training file or that the values you have for your
3361             features are legal vis-a-vis the earlier declarations of the values in the training
3362             file. Another safety feature incorporated in this version is the non-consideration
3363             of classes that are declared at the top of the training file but that have no sample
3364             records in the file.
3365              
3366             B<Version 1.6> uses probability caching much more extensively compared to the previous
3367             versions. This should result in faster construction of large decision trees.
3368             Another new feature in Version 1.6 is the use of a decision tree for interactive
3369             classification. In this mode, after you have constructed a decision tree from the
3370             training data, the user is prompted for answers to the questions pertaining to the
3371             feature tests at the nodes of the tree.
3372              
3373             Some key elements of the documentation were cleaned up and made more readable in
3374             B<Version 1.5>. The implementation code remains unchanged from Version 1.4.
3375              
3376             B<Version 1.4> should make things faster (and easier) for folks who want to use this
3377             module with training data that creates very large decision trees (that is, trees with
3378             tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3379             achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3380             addition, this version provides an additional constructor parameter,
C<max_depth_desired>, for controlling the size of the decision tree. This is in
addition to the tree size control achieved by the parameter C<entropy_threshold> that
3383             was introduced in Version 1.3. Since large decision trees can take a long time to
3384             create, you may find yourself wishing you could store the tree you just created in a
3385             disk file and that, subsequently, you could use the stored tree for classification
3386             work. The C directory contains two scripts, C and
3387             C, that show how you can do exactly that with the
3388             help of Perl's C<Storable> module.
3389              
3390             B<Version 1.3> addresses the issue that arises when the header of a training datafile
3391             declares a certain possible value for a feature but that (feature,value) pair does
3392             NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3393             user to control the size of the decision tree by changing the value of the parameter
3394             C<entropy_threshold>. Additionally, Version 1.3 includes a method called
3395             C that displays useful information regarding the size and
3396             some other attributes of the training data. It also warns the user if the training
3397             data might result in a decision tree that would simply be much too large --- unless
3398             the user controls the size with the entropy_threshold parameter.
3399              
3400             In addition to the removal of a couple of serious bugs, B<Version 1.2> incorporates a
3401             number of enhancements: (1) Version 1.2 includes checks on the names of the features
3402             and values used in test data --- this is the data you want to classify with the
3403             decision tree classifier constructed by this module. (2) Version 1.2 includes a
3404             separate constructor for generating test data. To make it easier to generate test
3405             data whose probabilistic parameters may not be identical to that used for the
3406             training data, I have used separate routines for generating the test data. (3)
3407             Version 1.2 also includes in its examples directory a script that classifies the test
3408             data in a file and outputs the class labels into another file. This is for folks who
3409             do not wish to write their own scripts using this module. (4) Version 1.2 also
3410             includes addition to the documentation regarding the issue of numeric values for
3411             features.
3412              
3413             =head1 DESCRIPTION
3414              
3415             B<Algorithm::DecisionTree> is a I<Perl> module for constructing a decision tree from
3416             a training datafile containing multidimensional data. In one form or another,
3417             decision trees have been around for about fifty years. From a statistical
3418             perspective, they are closely related to classification and regression by recursive
3419             partitioning of multidimensional data. Early work that demonstrated the usefulness
3420             of such partitioning of data for classification and regression can be traced to the
3421             work of Terry Therneau in the early 1980's in the statistics community, and to the
3422             work of Ross Quinlan in the mid 1990's in the machine learning community.
3423              
3424             For those not familiar with decision tree ideas, the traditional way to classify
3425             multidimensional data is to start with a feature space whose dimensionality is the
3426             same as that of the data. Each feature in this space corresponds to the attribute
3427             that each dimension of the data measures. You then use the training data to carve up
3428             the feature space into different regions, each corresponding to a different class.
3429             Subsequently, when you try to classify a new data sample, you locate it in the
3430             feature space and find the class label of the region to which it belongs. One can
3431             also give the new data point the same class label as that of the nearest training
3432             sample. This is referred to as the nearest neighbor classification. There exist
3433             hundreds of variations of varying power on these two basic approaches to the
3434             classification of multidimensional data.
3435              
3436             A decision tree classifier works differently. When you construct a decision tree,
3437             you select for the root node a feature test that partitions the training data in a
3438             way that causes maximal disambiguation of the class labels associated with the data.
3439             In terms of information content as measured by entropy, such a feature test would
3440             cause maximum reduction in class entropy in going from all of the training data taken
3441             together to the data as partitioned by the feature test. You then drop from the root
3442             node a set of child nodes, one for each partition of the training data created by the
3443             feature test at the root node. When your features are purely symbolic, you'll have
3444             one child node for each value of the feature chosen for the feature test at the root.
3445             When the test at the root involves a numeric feature, you find the decision threshold
3446             for the feature that best bipartitions the data and you drop from the root node two
3447             child nodes, one for each partition. Now at each child node you pose the same
3448             question that you posed when you found the best feature to use at the root: Which
3449             feature at the child node in question would maximally disambiguate the class labels
3450             associated with the training data corresponding to that child node?
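
The entropy computation that drives this choice of feature tests can be conveyed
with the following stand-alone sketch; it only illustrates the idea and is not
the module's internal code:

# Shannon entropy (in bits) of a list of class labels:
sub class_entropy {
    my @labels = @_;
    my %counts;
    $counts{$_}++ foreach @labels;
    my $entropy = 0;
    foreach my $count (values %counts) {
        my $p = $count / scalar(@labels);
        $entropy -= $p * log($p) / log(2);
    }
    return $entropy;
}

# A good feature test is one that maximally reduces this entropy when the
# training samples are partitioned by the outcomes of the test.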
3451              
3452             As the reader would expect, the two key steps in any approach to decision-tree based
3453             classification are the construction of the decision tree itself from a file
3454             containing the training data, and then using the decision tree thus obtained for
3455             classifying new data.
3456              
3457             What is cool about decision tree classification is that it gives you soft
3458             classification, meaning it may associate more than one class label with a given data
3459             vector. When this happens, it may mean that your classes are indeed overlapping in
3460             the underlying feature space. It could also mean that you simply have not supplied
3461             sufficient training data to the decision tree classifier. For a tutorial
3462             introduction to how a decision tree is constructed and used, visit
3463             L
3464              
3465             This module also allows you to generate your own synthetic training and test
3466             data. Generating your own training data, using it for constructing a decision-tree
3467             classifier, and subsequently testing the classifier on a synthetically generated
3468             test set of data is a good way to develop greater proficiency with decision trees.
3469              
3470              
3471             =head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3472              
3473             If you are new to the concept of a decision tree, their practical utility is best
3474             understood with an example that only involves symbolic features. However, as
3475             mentioned earlier, versions of the module higher than 2.0 allow you to use both
3476             symbolic and numeric features.
3477              
3478             Consider the following scenario: Let's say you are running a small investment company
3479             that employs a team of stockbrokers who make buy/sell decisions for the customers of
3480             your company. Assume that your company has asked the traders to make each investment
3481             decision on the basis of the following four criteria:
3482              
3483             price_to_earnings_ratio (P_to_E)
3484              
3485             price_to_sales_ratio (P_to_S)
3486              
3487             return_on_equity (R_on_E)
3488              
3489             market_share (MS)
3490              
3491             Since you are the boss, you keep track of the buy/sell decisions made by the
3492             individual traders. But one unfortunate day, all of your traders decide to quit
3493             because you did not pay them enough. So what do you do? If you had a module like
3494             the one here, you could still run your company and do so in such a way that, on the
3495             average, would do better than any of the individual traders who worked for your
3496             company. This is what you do: You pool together the individual trader buy/sell
3497             decisions you have accumulated during the last one year. This pooled information is
3498             likely to look like:
3499              
3500              
3501             example buy/sell P_to_E P_to_S R_on_E MS
3502             =============================================================
3503              
3504             example_1 buy high low medium low
3505             example_2 buy medium medium low low
3506             example_3 sell low medium low high
3507             ....
3508             ....
3509              
3510             This data, when formatted according to CSV, would constitute your training file. You
3511             could feed this file into the module by calling:
3512              
3513             my $dt = Algorithm::DecisionTree->new(
3514             training_datafile => $training_datafile,
3515             csv_class_column_index => 1,
3516             csv_columns_for_features => [2,3,4,5],
3517             );
3518             $dt->get_training_data();
3519             $dt->calculate_first_order_probabilities();
3520             $dt->calculate_class_priors();
3521              
3522             Subsequently, you would construct a decision tree by calling:
3523              
3524             my $root_node = $dt->construct_decision_tree_classifier();
3525              
3526             Now you and your company (with practically no employees) are ready to service the
3527             customers again. Suppose your computer needs to make a buy/sell decision about an
3528             investment prospect that is best described by:
3529              
3530             price_to_earnings_ratio = low
3531             price_to_sales_ratio = very_low
3532             return_on_equity = none
3533             market_share = medium
3534              
3535             All that your computer would need to do would be to construct a data vector like
3536              
3537             my @data = qw / P_to_E=low
3538             P_to_S=very_low
3539             R_on_E=none
3540             MS=medium /;
3541              
3542             and call the decision tree classifier you just constructed by
3543              
3544             $dt->classify($root_node, \@data);
3545              
3546             The answer returned will be 'buy' and 'sell', along with the associated
3547             probabilities. So if the probability of 'buy' is considerably greater than the
3548             probability of 'sell', that's what you should instruct your computer to do.
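
A minimal way of acting on the returned hash (assuming, as in the SYNOPSIS
above, that the solution path is stored under the key 'solution_path') might
look like this sketch:

my $result = $dt->classify($root_node, \@data);
delete $result->{"solution_path"};
my ($decision) = sort { $result->{$b} <=> $result->{$a} } keys %$result;
print "recommended action: $decision (probability: $result->{$decision})\n";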
3549              
3550             The chances are that, on the average, this approach would beat the performance of any
3551             of your individual traders who worked for you previously since the buy/sell decisions
3552             made by the computer would be based on the collective wisdom of all your previous
traders. B<Obviously, not every aspect of such pooled decision making is
captured by the silly little example here. However, it does nicely convey the
sense in which the current module could be used.>
3556              
3557             =head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3558              
3559             A feature is symbolic when its values are compared using string comparison operators.
3560             By the same token, a feature is numeric when its values are compared using numeric
3561             comparison operators. Having said that, features that take only a small number of
3562             numeric values in the training data can be treated symbolically provided you are
3563             careful about handling their values in the test data. At the least, you have to set
3564             the test data value for such a feature to its closest value in the training data.
3565             The module does that automatically for you for those numeric features for which the
3566             number of different numeric values is less than a user-specified threshold. For those
3567             numeric features that the module is allowed to treat symbolically, this snapping of
3568             the values of the features in the test data to the small set of values in the training
3569             data is carried out automatically by the module. That is, after a user has told the
3570             module which numeric features to treat symbolically, the user need not worry about
3571             how the feature values appear in the test data.
3572              
3573             The constructor parameter C<symbolic_to_numeric_cardinality_threshold> lets you tell
3574             the module when to consider an otherwise numeric feature symbolically. Suppose you
3575             set this parameter to 10, that means that all numeric looking features that take 10
3576             or fewer different values in the training datafile will be considered to be symbolic
3577             features by the module. See the tutorial at
3578             L for
3579             further information on the implementation issues related to the symbolic and numeric
3580             features.
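
The "snapping" mentioned above amounts to replacing a test-data value by the
closest of the values seen in the training data for that feature. A stand-alone
sketch of the idea (not the module's own code) would be:

sub snap_to_closest_training_value {
    my ($test_value, @training_values) = @_;
    my ($closest) = sort { abs($a - $test_value) <=> abs($b - $test_value) } @training_values;
    return $closest;
}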
3581              
3582             =head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3583              
3584             For the purpose of estimating the probabilities, it is necessary to sample the range
3585             of values taken on by a numerical feature. For features with "nice" statistical
3586             properties, this sampling interval is set to the median of the differences between
3587             the successive feature values in the training data. (Obviously, as you would expect,
3588             you first sort all the values for a feature before computing the successive
3589             differences.) This logic will not work for the sort of a feature described below.
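
For a feature with such "nice" statistical properties, the sampling interval
described above could be computed along the following lines (an illustrative
sketch only, not the module's internal code):

sub sampling_interval {
    my @values = sort { $a <=> $b } @_;                      # sorted feature values
    my @diffs  = sort { $a <=> $b }
                 map { $values[$_ + 1] - $values[$_] } 0 .. $#values - 1;
    return $diffs[ int(@diffs / 2) ];                        # median of successive differences
}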
3590              
3591             Consider a feature whose values are heavy-tailed, and, at the same time, the values
3592             span a million to one range. What I mean by heavy-tailed is that rare values can
3593             occur with significant probabilities. It could happen that most of the values for
3594             such a feature are clustered at one of the two ends of the range. At the same time,
3595             there may exist a significant number of values near the end of the range that is less
3596             populated. (Typically, features related to human economic activities --- such as
3597             wealth, incomes, etc. --- are of this type.) With the logic described in the
3598             previous paragraph, you could end up with a sampling interval that is much too small,
3599             which could result in millions of sampling points for the feature if you are not
3600             careful.
3601              
3602             Beginning with Version 2.22, you have two options in dealing with such features. You
3603             can choose to go with the default behavior of the module, which is to sample the
3604             value range for such a feature over a maximum of 500 points. Or, you can supply an
3605             additional option to the constructor that sets a user-defined value for the number of
3606             points to use. The name of the option is C<number_of_histogram_bins>. The following
3607             script
3608              
3609             construct_dt_for_heavytailed.pl
3610              
3611             in the C<Examples> directory shows an example of how to call the constructor of the
3612             module with the C<number_of_histogram_bins> option.
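
A constructor call that sets this option might look like the following sketch,
in which the file name, the column indices, and the value of 100 are only
placeholders:

my $dt = Algorithm::DecisionTree->new(
                 training_datafile => "heavytailed_training.csv",
                 csv_class_column_index => 1,
                 csv_columns_for_features => [2,3,4],
                 number_of_histogram_bins => 100,
         );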
3613              
3614              
3615             =head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3616              
Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
3619             training data to test its ability to discriminate between the classes mentioned in
3620             the training file.
3621              
3622             The 10-fold cross-validation test divides all of the training data into ten parts,
3623             with nine parts used for training a decision tree and one part used for testing its
3624             ability to classify correctly. This selection of nine parts for training and one part
3625             for testing is carried out in all of the ten different possible ways.
3626              
3627             The following code fragment illustrates how you invoke the testing function of the
3628             EvalTrainingData class:
3629              
3630             my $training_datafile = "training.csv";
3631             my $eval_data = EvalTrainingData->new(
3632             training_datafile => $training_datafile,
3633             csv_class_column_index => 1,
3634             csv_columns_for_features => [2,3],
3635             entropy_threshold => 0.01,
3636             max_depth_desired => 3,
3637             symbolic_to_numeric_cardinality_threshold => 10,
3638             );
3639             $eval_data->get_training_data();
3640             $eval_data->evaluate_training_data();
3641              
3642             The last statement above prints out a Confusion Matrix and the value of Training Data
3643             Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3644             The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3645             cross-validation test.
3646              
3647             This testing functionality can also be used to find the best values to use for the
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.
3650              
3651             The following two scripts in the C<Examples> directory illustrate the use of the
3652             C<EvalTrainingData> class for testing the quality of your data:
3653              
3654             evaluate_training_data1.pl
3655             evaluate_training_data2.pl
3656              
3657              
3658             =head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3659              
3660             Assuming your training data is good, the quality of the results you get from a
3661             decision tree would depend on the choices you make for the constructor parameters
C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>. You can optimize your choices for
3664             these parameters by running the 10-fold cross-validation test that is made available
3665             in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
3666             in the module file. A description of how to run this test is in the previous section
3667             of this document.
3668              
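One simple way to carry out this optimization is to sweep over candidate values
and run the C<EvalTrainingData> test for each combination, as in the following
sketch (the candidate values shown are arbitrary):

foreach my $entropy_threshold (0.01, 0.1, 0.5) {
    foreach my $max_depth (3, 5, 8) {
        print "\ntrying entropy_threshold=$entropy_threshold  max_depth_desired=$max_depth\n";
        my $eval_data = EvalTrainingData->new(
                            training_datafile => "training.csv",
                            csv_class_column_index => 1,
                            csv_columns_for_features => [2,3],
                            entropy_threshold => $entropy_threshold,
                            max_depth_desired => $max_depth,
                            symbolic_to_numeric_cardinality_threshold => 10,
                        );
        $eval_data->get_training_data();
        $eval_data->evaluate_training_data();   # prints the Confusion Matrix and the quality index
    }
}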
3669              
3670             =head1 DECISION TREE INTROSPECTION
3671              
3672             Starting with Version 2.30, you can ask the C class of the module to
3673             explain the classification decisions made at the different nodes of the decision
3674             tree.
3675              
3676             Perhaps the most important bit of information you are likely to seek through DT
3677             introspection is the list of the training samples that fall directly in the portion
3678             of the feature space that is assigned to a node.
3679              
3680             However, note that, when training samples are non-uniformly distributed in the
3681             underlying feature space, it is possible for a node to exist even when there are no
3682             training samples in the portion of the feature space assigned to the node. That is
3683             because the decision tree is constructed from the probability densities estimated
3684             from the training data. When the training samples are non-uniformly distributed, it
3685             is entirely possible for the estimated probability densities to be non-zero in a
3686             small region around a point even when there are no training samples specifically in
3687             that region. (After you have created a statistical model for, say, the height
3688             distribution of people in a community, the model may return a non-zero probability
3689             for the height values in a small interval even if the community does not include a
3690             single individual whose height falls in that interval.)
3691              
3692             That a decision-tree node can exist even when there are no training samples in that
3693             portion of the feature space that belongs to the node is an important indication of
3694             the generalization ability of a decision-tree-based classifier.
3695              
3696             In light of the explanation provided above, before the DTIntrospection class supplies
3697             any answers at all, it asks you to accept the fact that features can take on non-zero
3698             probabilities at a point in the feature space even though there are zero training
3699             samples at that point (or in a small region around that point). If you do not accept
3700             this rudimentary fact, the introspection class will not yield any answers (since you
3701             are not going to believe the answers anyway).
3702              
3703             The point made above implies that the path leading to a node in the decision tree may
3704             test a feature for a certain value or threshold despite the fact that the portion of
3705             the feature space assigned to that node is devoid of any training data.
3706              
3707             See the following three scripts in the Examples directory for how to carry out DT
3708             introspection:
3709              
3710             introspection_in_a_loop_interactive.pl
3711              
3712             introspection_show_training_samples_at_all_nodes_direct_influence.pl
3713              
3714             introspection_show_training_samples_to_nodes_influence_propagation.pl
3715              
3716             The first script places you in an interactive session in which you will first be
3717             asked for the node number you are interested in. Subsequently, you will be asked for
3718             whether or not you are interested in specific questions that the introspection can
3719             provide answers for. The second script descends down the decision tree and shows for
3720             each node the training samples that fall directly in the portion of the feature space
3721             assigned to that node. The third script shows for each training sample how it
3722             affects the decision-tree nodes either directly or indirectly through the
3723             generalization achieved by the probabilistic modeling of the data.
3724              
The output of the script
C<introspection_show_training_samples_at_all_nodes_direct_influence.pl> looks like:
3727              
3728             Node 0: the samples are: None
3729             Node 1: the samples are: [sample_46 sample_58]
3730             Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3731             Node 3: the samples are: []
3732             Node 4: the samples are: []
3733             ...
3734             ...
3735              
3736             The nodes for which no samples are listed come into existence through
3737             the generalization achieved by the probabilistic modeling of the data.
3738              
The output produced by the script
C<introspection_show_training_samples_to_nodes_influence_propagation.pl> looks like
3741              
3742             sample_1:
3743             nodes affected directly: [2 5 19 23]
3744             nodes affected through probabilistic generalization:
3745             2=> [3 4 25]
3746             25=> [26]
3747             5=> [6]
3748             6=> [7 13]
3749             7=> [8 11]
3750             8=> [9 10]
3751             11=> [12]
3752             13=> [14 18]
3753             14=> [15 16]
3754             16=> [17]
3755             19=> [20]
3756             20=> [21 22]
3757             23=> [24]
3758            
3759             sample_4:
3760             nodes affected directly: [2 5 6 7 11]
3761             nodes affected through probabilistic generalization:
3762             2=> [3 4 25]
3763             25=> [26]
3764             5=> [19]
3765             19=> [20 23]
3766             20=> [21 22]
3767             23=> [24]
3768             6=> [13]
3769             13=> [14 18]
3770             14=> [15 16]
3771             16=> [17]
3772             7=> [8]
3773             8=> [9 10]
3774             11=> [12]
3775            
3776             ...
3777             ...
3778             ...
3779              
3780             For each training sample, the display shown above first presents the list of nodes
3781             that are directly affected by the sample. A node is affected directly by a sample if
3782             the latter falls in the portion of the feature space that belongs to the former.
3783             Subsequently, for each training sample, the display shows a subtree of the nodes that
3784             are affected indirectly by the sample through the generalization achieved by the
3785             probabilistic modeling of the data. In general, a node is affected indirectly by a
3786             sample if it is a descendant of another node that is affected directly.
3787              
Also see the section titled B<THE INTROSPECTION API> regarding how to invoke the
3789             introspection capabilities of the module in your own code.
3790              
3791             =head1 METHODS
3792              
3793             The module provides the following methods for constructing a decision tree from
3794             training data in a disk file and for classifying new data records with the decision
3795             tree thus constructed:
3796              
3797             =over 4
3798              
=item B<new():>
3800              
3801             my $dt = Algorithm::DecisionTree->new(
3802             training_datafile => $training_datafile,
3803             csv_class_column_index => 2,
3804             csv_columns_for_features => [3,4,5,6,7,8],
3805             entropy_threshold => 0.01,
3806             max_depth_desired => 8,
3807             symbolic_to_numeric_cardinality_threshold => 10,
3808             );
3809              
A call to C<new()> constructs a new instance of the C<Algorithm::DecisionTree> class.
For this call to make sense, the training data in the training datafile must be
in the CSV format.
3813              
3814             =back
3815              
3816             =head2 Constructor Parameters
3817              
3818             =over 8
3819              
=item C<training_datafile>:
3821              
3822             This parameter supplies the name of the file that contains the training data.
3823              
=item C<csv_class_column_index>:
3825              
3826             When using a CSV file for your training data, this parameter supplies the zero-based
3827             column index for the column that contains the class label for each data record in the
3828             training file.
3829              
=item C<csv_columns_for_features>:
3831              
3832             When using a CSV file for your training data, this parameter supplies a list of
3833             columns corresponding to the features you wish to use for decision tree construction.
3834             Each column is specified by its zero-based index.
3835              
=item C<entropy_threshold>:

This parameter sets the granularity with which the entropies are sampled by the
module. For example, a feature test at a node in the decision tree is acceptable if
the entropy gain achieved by the test exceeds this threshold. The larger the value
you choose for this parameter, the smaller the tree. Its default value is 0.01.
3842              
=item C<max_depth_desired>:
3844              
3845             This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3846             smaller the value you choose for this parameter, the smaller the tree.
3847              
=item C<symbolic_to_numeric_cardinality_threshold>:
3849              
3850             This parameter allows the module to treat an otherwise numeric feature symbolically
3851             if the number of different values the feature takes in the training data file does
3852             not exceed the value of this parameter.
3853              
=item C<number_of_histogram_bins>:
3855              
3856             This parameter gives the user the option to set the number of points at which the
3857             value range for a feature should be sampled for estimating the probabilities. This
3858             parameter is effective only for those features that occupy a large value range and
whose probability distributions are heavy tailed. B<This parameter is also important
when you have a very large training dataset:> In general, the larger the dataset, the
smaller the smallest difference between any two values for a numeric feature in
relation to the overall range of values for that feature. In such cases, the module
may use too large a number of bins for estimating the probabilities and that may slow
down the calculation of the decision tree. You can get around this difficulty by
explicitly giving a value to the C<number_of_histogram_bins> parameter.
3866              
3867             =back
3868              
3869              
3870             You can choose the best values to use for the last three constructor parameters by
3871             running a 10-fold cross-validation test on your training data through the class
C<EvalTrainingData> that comes with Versions 2.1 and higher of this module. See the
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document.
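
For reference, here is a minimal sketch of such a cross-validation run. It is modeled
on the C<evaluate_training_data1.pl> and C<evaluate_training_data2.pl> scripts
mentioned later in this document; treat the exact method names as assumptions to be
verified against those scripts:

    use Algorithm::DecisionTree;

    # EvalTrainingData is a subclass of DecisionTree defined in the same module
    # file.  It takes the same constructor parameters as Algorithm::DecisionTree.
    my $eval_data = EvalTrainingData->new(
                        training_datafile        => "stage3cancer.csv",
                        csv_class_column_index   => 2,
                        csv_columns_for_features => [3,4,5,6,7,8],
                        entropy_threshold        => 0.01,
                        max_depth_desired        => 8,
                        symbolic_to_numeric_cardinality_threshold => 10,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data();   # runs the 10-fold cross-validation test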
3874              
3875             =over
3876              
=item B<get_training_data():>

After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
you must now read in the training data that is in the file named in the call to the
constructor. This you do by:
3882              
3883             $dt->get_training_data();
3884              
3885              
=item B<show_training_data():>
3887              
3888             If you wish to see the training data that was just digested by the module,
3889             call
3890              
3891             $dt->show_training_data();
3892              
=item B<calculate_first_order_probabilities():>

=item B<calculate_class_priors():>
3896              
3897             After the module has read the training data file, it needs to initialize the
3898             probability cache. This you do by invoking:
3899              
3900             $dt->calculate_first_order_probabilities()
3901             $dt->calculate_class_priors()
3902              
=item B<construct_decision_tree_classifier():>
3904              
3905             With the probability cache initialized, it is time to construct a decision tree
3906             classifier. This you do by
3907              
3908             my $root_node = $dt->construct_decision_tree_classifier();
3909              
This call returns an instance of type C<DTNode>. The C<DTNode> class is defined
within the main package file. So, don't forget that C<$root_node> in the above
example call will be instantiated to an object of type C<DTNode>.
3913              
3914             =item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3915              
3916             $root_node->display_decision_tree(" ");
3917              
3918             This will display the decision tree in your terminal window by using a recursively
3919             determined offset for each node as the display routine descends down the tree.
3920              
I have intentionally left the syntax fragment C<$root_node> in the above call to
remind the reader that C<display_decision_tree()> is NOT called on the instance of
the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
the call to C<construct_decision_tree_classifier()>.
3925              
=item B<classify($root_node, \@test_sample):>
3927              
3928             Let's say you want to classify the following data record:
3929              
3930             my @test_sample = qw / g2=4.2
3931             grade=2.3
3932             gleason=4
3933             eet=1.7
3934             age=55.0
3935             ploidy=diploid /;
3936              
3937             you'd make the following call:
3938              
3939             my $classification = $dt->classify($root_node, \@test_sample);
3940              
where, again, C<$root_node> is an instance of type C<DTNode> returned by the call to
C<construct_decision_tree_classifier()>. The variable C<$classification> holds a
reference to a hash whose keys are the class names and whose values are the associated
probabilities. The hash that is returned by the above call also includes a special
key-value pair for a key named C<solution_path>. The value associated with this key
is an anonymous array that holds the path, in the form of a list of nodes, from the
root node to the leaf node in the decision tree where the final classification was
made. (A complete end-to-end sketch that strings together the calls in this list is
shown right after the list.)
3949              
3950              
=item B<classify_by_asking_questions($root_node):>
3952              
3953             This method allows you to use a decision-tree based classifier in an interactive
3954             mode. In this mode, a user is prompted for answers to the questions pertaining to
3955             the feature tests at the nodes of the tree. The syntax for invoking this method is:
3956              
3957             my $classification = $dt->classify_by_asking_questions($root_node);
3958              
where C<$dt> is an instance of the C<Algorithm::DecisionTree> class returned by a
call to C<new()> and C<$root_node> the root node of the decision tree returned by a
call to C<construct_decision_tree_classifier()>.
3962              
3963             =back
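
Putting the above calls together, a minimal end-to-end session looks as follows. Only
the calls documented in the list above are used; the training file name and the
feature values are the illustrative ones used elsewhere in this documentation:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;

    my $dt = Algorithm::DecisionTree->new(
                 training_datafile         => "stage3cancer.csv",
                 csv_class_column_index    => 2,
                 csv_columns_for_features  => [3,4,5,6,7,8],
                 entropy_threshold         => 0.01,
                 max_depth_desired         => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
             );
    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();

    my $root_node = $dt->construct_decision_tree_classifier();
    $root_node->display_decision_tree("   ");

    my @test_sample = qw/ g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;

    # $classification is a hash reference: class names => probabilities, plus the
    # special 'solution_path' entry described above.
    my $classification = $dt->classify($root_node, \@test_sample);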
3964              
3965              
3966             =head1 THE INTROSPECTION API
3967              
To construct an instance of C<DTIntrospection>, you call
3969              
3970             my $introspector = DTIntrospection->new($dt);
3971              
where you supply the instance of the C<Algorithm::DecisionTree> class you used for constructing
3973             the decision tree through the parameter C<$dt>. After you have constructed an
3974             instance of the introspection class, you must initialize it by
3975              
3976             $introspector->initialize();
3977              
3978             Subsequently, you can invoke either of the following methods:
3979              
3980             $introspector->explain_classification_at_one_node($node);
3981              
3982             $introspector->explain_classifications_at_multiple_nodes_interactively();
3983              
3984             depending on whether you want introspection at a single specified node or inside an
3985             infinite loop for an arbitrary number of nodes.
3986              
3987             If you want to output a tabular display that shows for each node in the decision tree
3988             all the training samples that fall in the portion of the feature space that belongs
3989             to that node, call
3990              
3991             $introspector->display_training_samples_at_all_nodes_direct_influence_only();
3992              
3993             If you want to output a tabular display that shows for each training sample a list of
3994             all the nodes that are affected directly AND indirectly by that sample, call
3995              
    $introspector->display_training_samples_to_nodes_influence_propagation();
3997              
3998             A training sample affects a node directly if the sample falls in the portion of the
feature space assigned to that node. On the other hand, a training sample is
4000             considered to affect a node indirectly if the node is a descendant of a node that is
4001             affected directly by the sample.
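
Here is a minimal introspection session that strings together the calls shown above.
It assumes C<$dt> holds a trained C<Algorithm::DecisionTree> instance on which the
decision tree has already been constructed; the node argument is assumed here to be
an integer node number of the kind shown by C<display_decision_tree()>:

    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();

    # Explain the classification decision at one specific node:
    my $node = 5;                    # a node number from the displayed tree
    $introspector->explain_classification_at_one_node($node);

    # Tabulate, for every node, the training samples that fall directly in the
    # portion of the feature space assigned to that node:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();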
4002              
4003              
4004             =head1 BULK CLASSIFICATION OF DATA RECORDS
4005              
4006             For large test datasets, you would obviously want to process an entire file of test
data at a time. The following script in the C<Examples> directory illustrates how you
can do that:
4009              
4010             classify_test_data_in_a_file.pl
4011              
This script requires three command-line arguments: the first names the training
datafile, the second the test datafile, and the third the file in which the
classification results are to be deposited.
4015              
The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
4018             bulk classification of data records when you wish to take advantage of bagging,
4019             boosting, or tree randomization. In their respective directories, these scripts are
4020             named:
4021              
4022             bagging_for_bulk_classification.pl
4023             boosting_for_bulk_classification.pl
4024             classify_database_records.pl
4025              
4026              
4027             =head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4028              
4029             It depends on whether you apply the classifier at once to all the data samples in a
4030             file, or whether you feed one data sample at a time into the classifier.
4031              
4032             In general, the classifier returns soft classification for a test data vector. What
4033             that means is that, in general, the classifier will list all the classes to which a
4034             given data vector could belong and the probability of each such class label for the
data vector. Run the example scripts in the Examples directory to see how the output
4036             of classification can be displayed.
4037              
4038             With regard to the soft classifications returned by this classifier, if the
4039             probability distributions for the different classes overlap in the underlying feature
4040             space, you would want the classifier to return all of the applicable class labels for
a data vector along with the corresponding class probabilities. Another reason why
the decision tree classifier may associate significant probabilities with multiple
class labels is that you used an inadequate number of training samples to induce the
decision tree. The good thing is that the classifier does not lie to you (unlike,
say, a hard classification rule that would return a single class label corresponding
to the partitioning of the underlying feature space). The decision tree classifier
gives you the best classification that can be made given the training data you fed
into it.
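
As an illustration of working with such a soft classification, the hash returned by
C<classify()> can be examined as shown below. This is only a sketch; it assumes the
C<solution_path> key described in the METHODS section:

    my $classification = $dt->classify($root_node, \@test_sample);

    # Pull out the solution path, then print the classes sorted by decreasing
    # probability:
    my $solution_path = $classification->{'solution_path'};
    my @classes = grep { $_ ne 'solution_path' } keys %$classification;
    foreach my $class (sort { $classification->{$b} <=> $classification->{$a} } @classes) {
        printf "%-20s with probability %.4f\n", $class, $classification->{$class};
    }
    print "solution path: @$solution_path\n";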
4049              
4050              
4051             =head1 USING BAGGING
4052              
Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
4054             comes with the module to incorporate bagging in your decision tree based
4055             classification. Bagging means constructing multiple decision trees for different
4056             (possibly overlapping) segments of the data extracted from your training dataset and
4057             then aggregating the decisions made by the individual decision trees for the final
4058             classification. The aggregation of the classification decisions can average out the
4059             noise and bias that may otherwise affect the classification decision obtained from
4060             just one tree.
4061              
4062             =over 4
4063              
=item B<new():>
4065              
A typical call to the constructor for the C<DecisionTreeWithBagging> class looks
4067             like:
4068              
4069             use Algorithm::DecisionTreeWithBagging;
4070            
4071             my $training_datafile = "stage3cancer.csv";
4072            
4073             my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4074             training_datafile => $training_datafile,
4075             csv_class_column_index => 2,
4076             csv_columns_for_features => [3,4,5,6,7,8],
4077             entropy_threshold => 0.01,
4078             max_depth_desired => 8,
4079             symbolic_to_numeric_cardinality_threshold => 10,
4080             how_many_bags => 4,
4081             bag_overlap_fraction => 0.2,
4082             );
4083            
4084             Note in particular the following two constructor parameters:
4085            
4086             how_many_bags
4087              
4088             bag_overlap_fraction
4089              
where, as the name implies, the parameter C<how_many_bags> controls how many bags
(and, therefore, how many decision trees) will be constructed from your training
dataset; and where the parameter C<bag_overlap_fraction> controls the degree of
overlap between the bags. To understand what exactly is achieved by setting the
parameter C<bag_overlap_fraction> to 0.2 in the above example, let's say that the
non-overlapping partitioning of the training data between the bags results in 100
training samples per bag. With C<bag_overlap_fraction> set to 0.2, an additional 20
samples drawn randomly from the other bags will be added to the data in each bag.
4098              
4099             =back
4100              
=head2 B<Methods defined for C<DecisionTreeWithBagging> class>
4102              
4103             =over 8
4104              
=item B<get_training_data_for_bagging():>
4106              
4107             This method reads your training datafile, randomizes it, and then partitions it into
4108             the specified number of bags. Subsequently, if the constructor parameter
C<bag_overlap_fraction> is non-zero, it adds to each bag additional samples drawn at
random from the other bags. The number of these additional samples added to each bag
is controlled by the constructor parameter C<bag_overlap_fraction>. If this
4112             parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4113             drawn from the other bags.
4114              
=item B<show_training_data_in_bags():>

Shows for each bag the names of the training data samples in that bag.

=item B<calculate_first_order_probabilities():>

Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
first-order probabilities from the data samples in each bag.

=item B<calculate_class_priors():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4127             class priors for the data samples in each bag.
4128              
=item B<construct_decision_trees_for_bags():>

Calls on the appropriate method of the main C<DecisionTree> class to construct a
4132             decision tree from the training data in each bag.
4133              
=item B<display_decision_trees_for_bags():>

Displays separately the decision tree constructed for each bag.
4137              
=item B<classify_with_bagging($test_sample):>

Calls on the appropriate methods of the main C<DecisionTree> class to classify the
4141             argument test sample.
4142              
=item B<display_classification_results_for_each_bag():>

Displays separately the classification decision made by the decision tree
constructed for each bag.
4147              
=item B<get_majority_vote_classification():>
4149              
4150             Using majority voting, this method aggregates the classification decisions made by
4151             the individual decision trees into a single decision.
4152              
4153             =back
4154              
See the example scripts in the directory C<ExamplesBagging> for how to call these
4156             methods for classifying individual samples and for bulk classification when you place
4157             all your test samples in a single file.
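
Pieced together from the constructor call and the methods listed above, a typical
bagging session looks like the following sketch (the method names are the ones given
above; consult the C<ExamplesBagging> scripts for the authoritative usage):

    use Algorithm::DecisionTreeWithBagging;

    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                    training_datafile        => "stage3cancer.csv",
                    csv_class_column_index   => 2,
                    csv_columns_for_features => [3,4,5,6,7,8],
                    entropy_threshold        => 0.01,
                    max_depth_desired        => 8,
                    symbolic_to_numeric_cardinality_threshold => 10,
                    how_many_bags            => 4,
                    bag_overlap_fraction     => 0.2,
                );
    $dtbag->get_training_data_for_bagging();
    $dtbag->calculate_first_order_probabilities();
    $dtbag->calculate_class_priors();
    $dtbag->construct_decision_trees_for_bags();

    my @test_sample = qw/ g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;
    $dtbag->classify_with_bagging(\@test_sample);
    $dtbag->display_classification_results_for_each_bag();

    # Assumed here to return the class label chosen by majority voting:
    my $majority_decision = $dtbag->get_majority_vote_classification();
    print "Majority-vote classification: $majority_decision\n";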
4158              
4159             =head1 USING BOOSTING
4160              
Starting with Version 3.20, you can use the class C<BoostedDecisionTree> for
4162             constructing a boosted decision-tree classifier. Boosting results in a cascade of
4163             decision trees in which each decision tree is constructed with samples that are
4164             mostly those that are misclassified by the previous decision tree. To be precise,
4165             you create a probability distribution over the training samples for the selection of
4166             samples for training each decision tree in the cascade. To start out, the
4167             distribution is uniform over all of the samples. Subsequently, this probability
4168             distribution changes according to the misclassifications by each tree in the cascade:
4169             if a sample is misclassified by a given tree in the cascade, the probability of its
4170             being selected for training the next tree is increased significantly. You also
4171             associate a trust factor with each decision tree depending on its power to classify
4172             correctly all of the training data samples. After a cascade of decision trees is
4173             constructed in this manner, you construct a final classifier that calculates the
4174             class label for a test data sample by taking into account the classification
4175             decisions made by each individual tree in the cascade, the decisions being weighted
4176             by the trust factors associated with the individual classifiers. These boosting
4177             notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4178             celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4179             Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4180             the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4181             tutorial introduction to AdaBoost, see L
4182              
4183             Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4184             boosting apply only to the case of binary classification. Additionally, your
4185             training dataset must capture all of the significant statistical variations in the
4186             classes represented therein.
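
For readers who want to see the arithmetic behind the trust factors, the following
self-contained sketch shows the textbook AdaBoost bookkeeping for one stage of the
cascade. The module carries out the equivalent steps internally; the variable names
used here are illustrative and not the module's own:

    #!/usr/bin/perl
    use strict;
    use warnings;

    my @weights       = (0.25, 0.25, 0.25, 0.25);  # distribution over 4 training samples
    my @misclassified = (0,    1,    0,    0);     # sample 1 was misclassified by this tree

    # Weighted error rate of this stage's tree:
    my $epsilon = 0;
    $epsilon += $weights[$_] * $misclassified[$_] for 0 .. $#weights;

    # Trust factor associated with this tree:
    my $alpha = 0.5 * log( (1 - $epsilon) / $epsilon );

    # Boost the weights of the misclassified samples, shrink the rest, and
    # renormalize so the weights again form a probability distribution:
    my $total = 0;
    for my $i (0 .. $#weights) {
        $weights[$i] *= $misclassified[$i] ? exp($alpha) : exp(-$alpha);
        $total += $weights[$i];
    }
    @weights = map { $_ / $total } @weights;

    printf "trust factor = %.4f   new weights = (%s)\n",
           $alpha, join(", ", map { sprintf "%.4f", $_ } @weights);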
4187              
4188             =over 4
4189              
=item B<new():>
4191              
4192             If you'd like to experiment with boosting, a typical call to the constructor for the
C<BoostedDecisionTree> class looks like:
4194              
4195             use Algorithm::BoostedDecisionTree;
4196             my $training_datafile = "training6.csv";
4197             my $boosted = Algorithm::BoostedDecisionTree->new(
4198             training_datafile => $training_datafile,
4199             csv_class_column_index => 1,
4200             csv_columns_for_features => [2,3],
4201             entropy_threshold => 0.01,
4202             max_depth_desired => 8,
4203             symbolic_to_numeric_cardinality_threshold => 10,
4204             how_many_stages => 4,
4205             );
4206              
4207             Note in particular the constructor parameter:
4208            
4209             how_many_stages
4210              
4211             As its name implies, this parameter controls how many stages will be used in the
4212             boosted decision tree classifier. As mentioned above, a separate decision tree is
4213             constructed for each stage of boosting using a set of training samples that are drawn
4214             through a probability distribution maintained over the entire training dataset.
4215              
4216             =back
4217              
=head2 B<Methods defined for C<BoostedDecisionTree> class>
4219              
4220             =over 8
4221              
=item B<get_training_data_for_base_tree():>

This method reads your training datafile and creates the data structures needed for
constructing the base decision tree.
4226              
=item B<show_training_data_for_base_tree():>
4228              
4229             Writes to the standard output the training data samples and also some relevant
4230             properties of the features used in the training dataset.
4231              
=item B<calculate_first_order_probabilities_and_class_priors():>

Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
4235             first-order probabilities and the class priors.
4236              
=item B<construct_base_decision_tree():>

Calls on the appropriate method of the main C<DecisionTree> class to construct the
4240             base decision tree.
4241              
=item B<display_base_decision_tree():>
4243              
4244             Displays the base decision tree in your terminal window. (The textual form of the
4245             decision tree is written out to the standard output.)
4246              
=item B<construct_cascade_of_trees():>
4248              
4249             Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4250             earlier, the training samples for each tree in the cascade are drawn using a
4251             probability distribution over the entire training dataset. This probability
4252             distribution for any given tree in the cascade is heavily influenced by which
4253             training samples are misclassified by the previous tree.
4254              
=item B<display_decision_trees_for_different_stages():>
4256              
4257             Displays separately in your terminal window the decision tree constructed for each
4258             stage of the cascade. (The textual form of the trees is written out to the standard
4259             output.)
4260              
=item B<classify_with_boosting($test_sample):>
4262              
4263             Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4264              
=item B<display_classification_results_for_each_stage():>
4266              
4267             You can call this method to display in your terminal window the classification
4268             decision made by each decision tree in the cascade. The method also prints out the
4269             trust factor associated with each decision tree. It is important to look
4270             simultaneously at the classification decision and the trust factor for each tree ---
4271             since a classification decision made by a specific tree may appear bizarre for a
4272             given test sample. This method is useful primarily for debugging purposes.
4273              
=item B<show_class_labels_for_misclassified_samples_in_stage($stage_index):>
4275              
4276             As with the previous method, this method is useful mostly for debugging. It returns
4277             class labels for the samples misclassified by the stage whose integer index is
4278             supplied as an argument to the method. Say you have 10 stages in your cascade. The
value of the argument C<$stage_index> would go from 0 to 9, with 0 corresponding to
4280             the base tree.
4281              
=item B<trust_weighted_majority_vote_classifier():>
4283              
4284             Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4285             classification decisions made by the individual trees while taking into account the
4286             trust factors associated with the trees. As mentioned earlier, we associate with
4287             each tree of the cascade a trust factor that depends on the overall misclassification
4288             rate associated with that tree.
4289              
4290             =back
4291              
See the example scripts in the C<ExamplesBoosting> subdirectory for how to call the
4293             methods listed above for classifying individual data samples with boosting and for
4294             bulk classification when you place all your test samples in a single file.
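
A typical boosted-classification session, assembled from the constructor call and the
methods listed above (the feature names in the test sample are placeholders; use the
names from your own training file), looks like:

    use Algorithm::BoostedDecisionTree;

    my $boosted = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => "training6.csv",
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2,3],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      symbolic_to_numeric_cardinality_threshold => 10,
                      how_many_stages          => 4,
                  );
    $boosted->get_training_data_for_base_tree();
    $boosted->calculate_first_order_probabilities_and_class_priors();
    $boosted->construct_base_decision_tree();
    $boosted->construct_cascade_of_trees();

    my @test_sample = qw/ feature1=12.0 feature2=46.8 /;   # placeholder feature names
    $boosted->classify_with_boosting(\@test_sample);
    $boosted->display_classification_results_for_each_stage();
    $boosted->trust_weighted_majority_vote_classifier();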
4295              
4296              
4297             =head1 USING RANDOMIZED DECISION TREES
4298              
As mentioned earlier, the new C<RandomizedTreesForBigData> class allows you to solve
4300             the following two problems: (1) Data classification using the needle-in-a-haystack
4301             metaphor, that is, when a vast majority of your training samples belong to just one
4302             class. And (2) You have access to a very large database of training samples and you
4303             wish to construct an ensemble of decision trees for classification.
4304              
4305             =over 4
4306              
=item B<new():>
4308              
Here is how you'd call the C<RandomizedTreesForBigData> constructor for
4310             needle-in-a-haystack classification:
4311              
4312             use Algorithm::RandomizedTreesForBigData;
4313             my $training_datafile = "your_database.csv";
4314             my $rt = Algorithm::RandomizedTreesForBigData->new(
4315             training_datafile => $training_datafile,
4316             csv_class_column_index => 48,
4317             csv_columns_for_features => [24,32,33,34,41],
4318             entropy_threshold => 0.01,
4319             max_depth_desired => 8,
4320             symbolic_to_numeric_cardinality_threshold => 10,
4321             how_many_trees => 5,
4322             looking_for_needles_in_haystack => 1,
4323             );
4324              
4325             Note in particular the constructor parameters:
4326              
4327             looking_for_needles_in_haystack
4328             how_many_trees
4329              
The first of these parameters, C<looking_for_needles_in_haystack>, invokes the logic for
4331             constructing an ensemble of decision trees, each based on a training dataset that
4332             uses all of the minority class samples, and a random drawing from the majority class
4333             samples.
4334              
Here is how you'd call the C<RandomizedTreesForBigData> constructor for a more
4336             general attempt at constructing an ensemble of decision trees, with each tree trained
4337             with randomly drawn samples from a large database of training data (without paying
4338             attention to the differences in the sizes of the populations for the different
4339             classes):
4340              
4341             use Algorithm::RandomizedTreesForBigData;
4342             my $training_datafile = "your_database.csv";
4343             my $rt = Algorithm::RandomizedTreesForBigData->new(
4344             training_datafile => $training_datafile,
4345             csv_class_column_index => 2,
4346             csv_columns_for_features => [3,4,5,6,7,8],
4347             entropy_threshold => 0.01,
4348             max_depth_desired => 8,
4349             symbolic_to_numeric_cardinality_threshold => 10,
4350             how_many_trees => 3,
4351             how_many_training_samples_per_tree => 50,
4352             );
4353              
4354             Note in particular the constructor parameters:
4355              
4356             how_many_training_samples_per_tree
4357             how_many_trees
4358              
When you set the C<how_many_training_samples_per_tree> parameter, you are not allowed
to also set the C<looking_for_needles_in_haystack> parameter, and vice versa.
4361              
4362             =back
4363              
=head2 B<Methods defined for C<RandomizedTreesForBigData> class>
4365              
4366             =over 8
4367              
=item B<get_training_data_for_N_trees():>
4369              
What this method does depends on which of the two constructor parameters ---
C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
set. When the former is set, it creates a collection of training datasets for
C<how_many_trees> number of decision trees, with each dataset being a mixture of the
minority class and samples drawn randomly from the majority class. However, when the
latter option is set, all the datasets are drawn randomly from the training database
with no particular attention given to the relative populations of the two classes.
4377              
=item B<show_training_data_for_all_trees():>
4379              
4380             As the name implies, this method shows the training data being used for all the
4381             decision trees. This method is useful for debugging purposes using small datasets.
4382              
=item B<calculate_first_order_probabilities():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4386             first-order probabilities for the training dataset to be used for each decision tree.
4387              
=item B<calculate_class_priors():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4391             class priors for the training dataset to be used for each decision tree.
4392              
=item B<construct_all_decision_trees():>

Calls on the appropriate method of the main C<DecisionTree> class to construct the
4396             decision trees.
4397              
=item B<display_all_decision_trees():>
4399              
4400             Displays all the decision trees in your terminal window. (The textual form of the
4401             decision trees is written out to the standard output.)
4402              
=item B<classify_with_all_trees($test_sample):>

The argument C<$test_sample> is sent to each decision tree for classification.
4406              
=item B<display_classification_results_for_all_trees():>
4408              
4409             The classification decisions returned by the individual decision trees are written
4410             out to the standard output.
4411              
=item B<get_majority_vote_classification():>
4413              
4414             This method aggregates the classification results returned by the individual decision
4415             trees and returns the majority decision.
4416              
4417             =back
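
Assembled from the constructor call and the methods listed above, a typical session
with randomized trees looks like the following sketch (method names as given above;
the test sample uses placeholder feature names):

    use Algorithm::RandomizedTreesForBigData;

    my $rt = Algorithm::RandomizedTreesForBigData->new(
                 training_datafile        => "your_database.csv",
                 csv_class_column_index   => 48,
                 csv_columns_for_features => [24,32,33,34,41],
                 entropy_threshold        => 0.01,
                 max_depth_desired        => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 how_many_trees           => 5,
                 looking_for_needles_in_haystack => 1,
             );
    $rt->get_training_data_for_N_trees();
    $rt->calculate_first_order_probabilities();
    $rt->calculate_class_priors();
    $rt->construct_all_decision_trees();

    my @test_sample = qw/ feature24=3.5 feature32=0.9 /;   # placeholder feature names
    $rt->classify_with_all_trees(\@test_sample);
    $rt->display_classification_results_for_all_trees();
    $rt->get_majority_vote_classification();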
4418              
4419             =head1 CONSTRUCTING REGRESSION TREES:
4420              
4421             Decision tree based modeling requires that the class labels be distinct. That is,
4422             the training dataset must contain a relatively small number of discrete class labels
4423             for all of your data records if you want to model the data with one or more decision
4424             trees. However, when one is trying to understand all of the associational
4425             relationships that exist in a large database, one often runs into situations where,
4426             instead of discrete class labels, you have a continuously valued variable as a
4427             dependent variable whose values are predicated on a set of feature values. It is for
such situations that you will find useful the new class C<RegressionTree> that is now
a part of the C<Algorithm::DecisionTree> module. The C<RegressionTree> class has been
programmed as a subclass of the main C<DecisionTree> class.
4431              
4432             You can think of regression with a regression tree as a powerful generalization of
4433             the very commonly used Linear Regression algorithms. Although you can certainly
4434             carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4435             modeling nonlinearities between the predictor variables and the dependent variable,
4436             specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4437             can inject continuities between the predictor and the predicted variables that may
4438             not really exist in the real data. Regression trees, on the other hand, give you a
4439             piecewise linear relationship between the predictor and the predicted variables that
4440             is freed from the constraints of superimposed continuities at the joins between the
4441             different segments. See the following tutorial for further information regarding the
4442             standard linear regression approach and the regression that can be achieved with the
4443             RegressionTree class in this module:
4444             L
4445              
4446             The RegressionTree class in the current version of the module assumes that all of
4447             your data is numerical. That is, unlike what is possible with the DecisionTree class
4448             (and the other more closely related classes in this module) that allow your training
4449             file to contain a mixture of numerical and symbolic data, the RegressionTree class
4450             requires that ALL of your data be numerical. I hope to relax this constraint in
4451             future versions of this module. Obviously, the dependent variable will always be
4452             numerical for regression.
4453              
See the example scripts in the directory C<ExamplesRegression> if you wish to become
4455             more familiar with the regression capabilities of the module.
4456              
4457             =over 4
4458              
=item B<new():>
4460              
4461             my $training_datafile = "gendata5.csv";
4462             my $rt = Algorithm::RegressionTree->new(
4463             training_datafile => $training_datafile,
4464             dependent_variable_column => 2,
4465             predictor_columns => [1],
4466             mse_threshold => 0.01,
4467             max_depth_desired => 2,
4468             jacobian_choice => 0,
4469             );
4470              
4471             Note in particular the constructor parameters:
4472              
    dependent_variable_column
4474             predictor_columns
4475             mse_threshold
4476             jacobian_choice
4477              
The first of these parameters, C<dependent_variable_column>, is set to the column
index in the CSV file for the dependent variable. The second constructor parameter,
C<predictor_columns>, tells the system which columns contain values for the
predictor variables. The third parameter, C<mse_threshold>, is for deciding when to
partition the data at a node into two child nodes as a regression tree is being
constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
partitioning any of the features at a node is smaller than C<mse_threshold>, that
node becomes a leaf node of the regression tree.
4486              
The last parameter, C<jacobian_choice>, must be set to either 0 or 1 or 2. Its
default value is 0. When this parameter equals 0, the regression coefficients are
calculated using the linear least-squares method and no further "refinement" of the
coefficients is carried out using gradient descent. This is the fastest way to
calculate the regression coefficients. When C<jacobian_choice> is set to 1, you get
a weak version of gradient descent in which the Jacobian is set to the "design
matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
approximation to the Jacobian. That, however, is at a cost of much longer
computation time. For most cases, using 0 for C<jacobian_choice> is the best choice.
See the tutorial mentioned earlier for why that is the case.
4498              
4499             =back
4500              
=head2 B<Methods defined for C<RegressionTree> class>
4502              
4503             =over 8
4504              
=item B<get_training_data_for_regression():>
4506              
4507             Only CSV training datafiles are allowed. Additionally, the first record in the file
4508             must list the names of the fields, and the first column must contain an integer ID
4509             for each record.
4510              
=item B<construct_regression_tree():>

As the name implies, this is the method that constructs a regression tree.
4514              
=item B<display_regression_tree("   "):>
4516              
4517             Displays the regression tree, as the name implies. The white-space string argument
4518             specifies the offset to use in displaying the child nodes in relation to a parent
4519             node.
4520              
=item B<prediction_for_single_data_point($root_node, $test_sample):>
4522              
4523             You call this method after you have constructed a regression tree if you want to
4524             calculate the prediction for one sample. The parameter C<$root_node> is what is
returned by the call to C<construct_regression_tree()>. The formatting of the argument
4526             bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4527             using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4528             this case, the C<$test_sample> parameter will be bound to a list that will look like
4529              
4530             ['xvar1 = 23.4', 'xvar2 = 12.9']
4531              
4532             Arbitrary amount of white space, including none, on the two sides of the equality
symbol is allowed in the construct shown above. A call to this method returns a
hash with two key-value pairs. One of the keys is called C<solution_path> and
the other C<prediction>. The value associated with the key C<solution_path> is the
path in the regression tree to the leaf node that yielded the prediction. And the
value associated with the key C<prediction> is the answer you are looking for.
4538              
=item B<predictions_for_all_data_used_for_regression_estimation($root_node):>
4540              
4541             This call calculates the predictions for all of the predictor variables data in your
4542             training file. The parameter C<$root_node> is what is returned by the call to
C<construct_regression_tree()>. The values for the dependent variable thus predicted
can be seen by calling the display method described in the next entry.
4545              
=item B<display_all_plots():>
4547              
4548             This method displays the results obtained by calling the prediction method of the
4549             previous entry. This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file. The name of this output file is fixed by the module.
4551              
=item B<error_for_data_used_for_regression_estimation($root_node):>
4553              
4554             This method carries out an error analysis of the predictions for the samples in your
4555             training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4556             regression, the MSE for the data samples at each of the leaf nodes of the regression
4557             tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4558             The parameter C<$root_node> in the call syntax is what is returned by the call to
C<construct_regression_tree()>.
4560              
=item B<bulk_predictions_for_data_in_a_csv_file($root_node, $filename, $columns):>
4562              
4563             Call this method if you want to apply the regression tree to all your test data in a
4564             disk file. The predictions for all of the test samples in the disk file are written
4565             out to another file whose name is the same as that of the test file except for the
4566             addition of C<_output> in the name of the file. The parameter C<$filename> is the
4567             name of the disk file that contains the test data. And the parameter C<$columns> is a
4568             list of the column indices for the predictor variables in the test file.
4569              
4570             =back
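
A typical regression-tree session, assembled from the constructor call and the
methods listed above (the predictor name C<xvar1> is the illustrative one used
earlier; substitute the field name from your own CSV file), looks like:

    use Algorithm::RegressionTree;

    my $rt = Algorithm::RegressionTree->new(
                 training_datafile         => "gendata5.csv",
                 dependent_variable_column => 2,
                 predictor_columns         => [1],
                 mse_threshold             => 0.01,
                 max_depth_desired         => 2,
                 jacobian_choice           => 0,
             );
    $rt->get_training_data_for_regression();
    my $root_node = $rt->construct_regression_tree();

    # Assumed here to be called on the RegressionTree instance:
    $rt->display_regression_tree("   ");

    # Prediction for a single data point; assumed here to return a hash reference
    # carrying the 'prediction' and 'solution_path' keys described above:
    my $answer = $rt->prediction_for_single_data_point($root_node, ['xvar1 = 23.4']);
    print "predicted value: $answer->{'prediction'}\n";
    print "solution path:   @{$answer->{'solution_path'}}\n";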
4571              
4572             =head1 GENERATING SYNTHETIC TRAINING DATA
4573              
4574             The module file contains the following additional classes: (1)
C<TrainingDataGeneratorNumeric>, and (2) C<TrainingDataGeneratorSymbolic> for
4576             generating synthetic training data.
4577              
The class C<TrainingDataGeneratorNumeric> outputs one CSV file for the
training data and another one for the test data for experimenting with numeric
features. The numeric values are generated using a multivariate Gaussian
distribution whose mean and covariance are specified in a parameter file. See the
example parameter file for numeric features in the C<Examples> directory for an
example of such a parameter file. Note that the dimensionality of the data is inferred from the
4584             information you place in the parameter file.
4585              
The class C<TrainingDataGeneratorSymbolic> generates synthetic training data for the
purely symbolic case. The relative frequencies of the different possible values for
the features are controlled by the biasing information you place in a parameter file.
See the example parameter file for symbolic features in the C<Examples> directory for
an example of such a file.
4590              
4591              
=head1 THE C<Examples> DIRECTORY

See the C<Examples> directory in the distribution for how to construct a decision
4595             tree, and how to then classify new data using the decision tree. To become more
4596             familiar with the module, run the scripts
4597              
4598             construct_dt_and_classify_one_sample_case1.pl
4599             construct_dt_and_classify_one_sample_case2.pl
4600             construct_dt_and_classify_one_sample_case3.pl
4601             construct_dt_and_classify_one_sample_case4.pl
4602              
4603             The first script is for the purely symbolic case, the second for the case that
4604             involves both numeric and symbolic features, the third for the case of purely numeric
features, and the last for the case when the training data is generated synthetically
by one of the data-generation scripts described later in this section.
4607              
4608             Next run the following script as it is for bulk classification of data records placed
4609             in a CSV file:
4610              
4611             classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4612              
The script first constructs a decision tree using the training data in the training
file supplied by the first argument file C<training4.csv>. The script then
calculates the class label for each data record in the test data file supplied
through the second argument file, C<test4.csv>. The estimated class labels are
written out to the output file which in the call shown above is C<out4.csv>. An
important thing to note here is that your test file --- in this case C<test4.csv> ---
must have a column for class labels. Obviously, in real-life situations, there will
be no class labels in this column. When that is the case, you can place an empty
string C<""> there for each data record. This is demonstrated by the following call:
4622              
4623             classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4624              
The following script in the C<Examples> directory
4626              
4627             classify_by_asking_questions.pl
4628              
4629             shows how you can use a decision-tree classifier interactively. In this mode, you
4630             first construct the decision tree from the training data and then the user is
4631             prompted for answers to the feature tests at the nodes of the tree.
4632              
4633             If your training data has a feature whose values span a large range and, at the same
4634             time, are characterized by a heavy-tail distribution, you should look at the script
4635              
4636             construct_dt_for_heavytailed.pl
4637              
to see how to use the option C<number_of_histogram_bins> in the call to the
4639             constructor. This option was introduced in Version 2.22 for dealing with such
4640             features. If you do not set this option, the module will use the default value of
4641             500 for the number of points at which to sample the value range for such a feature.
4642              
The C<Examples> directory also contains the following scripts:
4644              
4645             generate_training_data_numeric.pl
4646             generate_training_data_symbolic.pl
4647              
that show how you can use the module to generate synthetic training data. Synthetic
training data is generated according to the specifications laid out in a parameter file.
4650             There are constraints on how the information is laid out in a parameter file. See
the example parameter files provided in the C<Examples> directory for how to
structure these files.
4653              
The C<Examples> directory of Versions 2.1 and higher of the module also contains the
4655             following two scripts:
4656              
4657             evaluate_training_data1.pl
4658             evaluate_training_data2.pl
4659              
that illustrate how the Perl class C<EvalTrainingData> can be used to evaluate the
quality of your training data (as long as it resides in a C<.csv> file). This new
class is a subclass of the C<DecisionTree> class in the module file. See the README
in the C<Examples> directory for further information regarding these two scripts.
4664              
The C<Examples> directory of Versions 2.31 and higher of the module contains the
4666             following three scripts:
4667              
4668             introspection_in_a_loop_interactive.pl
4669              
4670             introspection_show_training_samples_at_all_nodes_direct_influence.pl
4671              
4672             introspection_show_training_samples_to_nodes_influence_propagation.pl
4673              
The first script illustrates how to use the C<DTIntrospection> class of the module
interactively for generating explanations for the classification decisions made at
the nodes of the decision tree. In the interactive session you are first asked for
the node number you are interested in. Subsequently, you are asked whether or not
you are interested in specific questions that the introspector can provide answers
for. The second script generates a tabular display that shows for each node of the
decision tree a list of the training samples that fall directly in the portion of the
feature space assigned to that node. (As mentioned elsewhere in this
4682             documentation, when this list is empty for a node, that means the node is a result of
4683             the generalization achieved by probabilistic modeling of the data. Note that this
4684             module constructs a decision tree NOT by partitioning the set of training samples,
4685             BUT by partitioning the domains of the probability density functions.) The third
4686             script listed above also generates a tabular display, but one that shows how the
4687             influence of each training sample propagates in the tree. This display first shows
4688             the list of nodes that are affected directly by the data in a training sample. This
4689             list is followed by an indented display of the nodes that are affected indirectly by
4690             the training sample. A training sample affects a node indirectly if the node is a
4691             descendant of one of the nodes affected directly.
4692              
4693              
=head1 THE C<ExamplesBagging> DIRECTORY

The C<ExamplesBagging> directory contains the following scripts:
4697              
4698             bagging_for_classifying_one_test_sample.pl
4699            
4700             bagging_for_bulk_classification.pl
4701              
4702             As the names of the scripts imply, the first shows how to call the different methods
of the C<DecisionTreeWithBagging> class for classifying a single test sample. When
4704             you are classifying a single test sample, you can also see how each bag is
4705             classifying the test sample. You can, for example, display the training data used in
4706             each bag, the decision tree constructed for each bag, etc.
4707              
4708             The second script is for the case when you place all of the test samples in a single
4709             file. The demonstration script displays for each test sample a single aggregate
4710             classification decision that is obtained through majority voting by all the decision
4711             trees.
4712              
4713              
=head1 THE C<ExamplesBoosting> DIRECTORY

The C<ExamplesBoosting> subdirectory in the main installation directory contains the
4717             following three scripts:
4718              
4719             boosting_for_classifying_one_test_sample_1.pl
4720              
4721             boosting_for_classifying_one_test_sample_2.pl
4722              
4723             boosting_for_bulk_classification.pl
4724              
4725             As the names of the first two scripts imply, these show how to call the different
methods of the C<BoostedDecisionTree> class for classifying a single test sample.
4727             When you are classifying a single test sample, you can see how each stage of the
4728             cascade of decision trees is classifying the test sample. You can also view each
4729             decision tree separately and also see the trust factor associated with the tree.
4730              
4731             The third script is for the case when you place all of the test samples in a single
4732             file. The demonstration script outputs for each test sample a single aggregate
4733             classification decision that is obtained through trust-factor weighted majority
4734             voting by all the decision trees.
4735              
=head1 THE C<ExamplesRandomizedTrees> DIRECTORY

The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
become more familiar with the C<RandomizedTreesForBigData> class for solving
4740             needle-in-a-haystack and big-data data classification problems. These scripts are:
4741              
4742             randomized_trees_for_classifying_one_test_sample_1.pl
4743              
4744             randomized_trees_for_classifying_one_test_sample_2.pl
4745              
4746             classify_database_records.pl
4747              
4748             The first script shows the constructor options to use for solving a
4749             needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
4750             training data belongs to just one class. The second script shows the constructor
4751             options for using randomized decision trees for the case when you have access to a
4752             very large database of training samples and you'd like to construct an ensemble of
4753             decision trees using training samples pulled randomly from the training database.
4754             The last script illustrates how you can evaluate the classification power of an
4755             ensemble of decision trees constructed by C<RandomizedTreesForBigData> by classifying
4756             a large number of test samples extracted randomly from the training database.
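
The sketch below illustrates the needle-in-a-haystack usage demonstrated by the first
script. As before, the datafile name, column indices, and feature names are
placeholders; for the big-database case of the second script you would drop the
C<looking_for_needles_in_haystack> option and instead set
C<how_many_training_samples_per_tree>.

    use strict;
    use warnings;
    use Algorithm::RandomizedTreesForBigData;

    my $rt_ensemble = Algorithm::RandomizedTreesForBigData->new(
                          training_datafile        => 'my_training_data.csv',   # placeholder
                          csv_class_column_index   => 1,
                          csv_columns_for_features => [2,3,4,5],
                          entropy_threshold        => 0.01,
                          max_depth_desired        => 8,
                          how_many_trees           => 5,
                          looking_for_needles_in_haystack => 1,
                      );
    $rt_ensemble->get_training_data_for_N_trees();
    $rt_ensemble->calculate_first_order_probabilities();
    $rt_ensemble->calculate_class_priors();
    $rt_ensemble->construct_all_decision_trees();

    ## Classify one test sample with every tree in the ensemble:
    my @test_sample = ('feature1 = 3.4', 'feature2 = 2.1', 'feature3 = 4.0', 'feature4 = 1.5');
    $rt_ensemble->classify_with_all_trees(\@test_sample);
    $rt_ensemble->display_classification_results_for_all_trees();
    print "\nMajority-vote classification: ",
          $rt_ensemble->get_majority_vote_classification(), "\n";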
4757              
4758              
4759             =head1 THE C<ExamplesRegression> DIRECTORY
4760              
4761             The C<ExamplesRegression> subdirectory in the main installation directory shows
4762             example scripts that you can use to become familiar with regression trees and how
4763             they can be used for nonlinear regression. If you are new to the concept of
4764             regression trees, start by executing the following scripts without changing them and
4765             see what sort of output is produced by them:
4766              
4767             regression4.pl
4768              
4769             regression5.pl
4770              
4771             regression6.pl
4772              
4773             regression8.pl
4774              
4775             The C<regression4.pl> script involves only one predictor variable and one dependent
4776             variable. The training data for this exercise is drawn from a CSV file supplied in
4777             the same directory; that file contains strongly nonlinear data. When you run the
4778             script C<regression4.pl>, you will see how much better the result from tree regression
4779             is compared to what you can get with linear regression.
4780              
4781             The C<regression5.pl> script is essentially the same as the previous script except
4782             for the fact that its training datafile consists of three noisy segments, as opposed
4783             to just two in the previous case.
4784              
4785             The script C<regression6.pl> deals with the case when we have two predictor variables
4786             and one dependent variable. You can think of the data as consisting of noisy height
4787             values over an C<(x1,x2)> plane. The data used in this script is also drawn from a
4788             CSV file supplied in the same directory.
4789              
4790             Finally, the script C<regression8.pl> shows how you can carry out bulk prediction for
4791             all your test data records in a disk file. The script writes all the calculated
4792             predictions into another disk file whose name is derived from the name of the test
4793             data file.
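
The sketch below shows the basic single-point regression workflow that the first
three scripts build on. The datafile name, the column indices, and the predictor
name C<xcoord> are placeholders, and you should consult the demo scripts for the
exact manner in which the returned prediction is unpacked and displayed.

    use strict;
    use warnings;
    use Data::Dumper;
    use Algorithm::RegressionTree;

    my $rt = Algorithm::RegressionTree->new(
                 training_datafile         => 'my_regression_data.csv',   # placeholder
                 dependent_variable_column => 2,     # 1-based column of the dependent variable
                 predictor_columns         => [1],   # columns holding the predictor variable(s)
                 mse_threshold             => 0.01,
                 max_depth_desired         => 2,
                 jacobian_choice           => 0,
             );
    $rt->get_training_data_for_regression();
    my $root_node = $rt->construct_regression_tree();
    $rt->display_regression_tree("     ");

    ## Predict the dependent variable at one point of the predictor space:
    my @test_point = ('xcoord = 40');
    my $answer = $rt->prediction_for_single_data_point($root_node, \@test_point);
    print Dumper($answer);    # dump the returned prediction structure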
4794              
4795              
4796             =head1 EXPORT
4797              
4798             None by design.
4799              
4800             =head1 BUGS
4801              
4802             Please notify the author if you encounter any bugs. When sending email, please place
4803             the string 'DecisionTree' in the subject line.
4804              
4805             =head1 INSTALLATION
4806              
4807             Download the archive from CPAN in any directory of your choice. Unpack the archive
4808             with a command that on a Linux machine would look like:
4809              
4810             tar zxvf Algorithm-DecisionTree-3.41.tar.gz
4811              
4812             This will create an installation directory for you whose name will be
4813             C<Algorithm-DecisionTree-3.41>. Enter this directory and execute the following
4814             commands for a standard install of the module if you have root privileges:
4815              
4816             perl Makefile.PL
4817             make
4818             make test
4819             sudo make install
4820              
4821             If you do not have root privileges, you can carry out a non-standard install of the
4822             module in any directory of your choice by:
4823              
4824             perl Makefile.PL prefix=/some/other/directory/
4825             make
4826             make test
4827             make install
4828              
4829             With a non-standard install, you may also have to set your PERL5LIB environment
4830             variable so that this module can find the required other modules. How you do that
4831             would depend on what platform you are working on. In order to install this module on
4832             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
4833             variable by
4834              
4835             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4836              
4837             If I used bash, I'd need to declare:
4838              
4839             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4840              
4841              
4842             =head1 THANKS
4843              
4844             I wish to thank many users of this module for their feedback. Many of the
4845             improvements I have made to the module over the years are a result of the feedback
4846             received.
4847              
4848             I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
4849             Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
4850             used for testing the module. Slaven's feedback in the form of the Bug report
4851             C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
4852             the required version of Perl to 5.10.
4853              
4854             On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version 2.27,
4855             I am removing the Perl version restriction altogether from Version 2.30. Thanks
4856             Slaven!
4857              
4858              
4859             =head1 AUTHOR
4860              
4861             The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
4862             with the publication of the book I<Designing with Objects> by John-Wiley. If
4863             interested, visit his web page at Purdue to find out what this project was all
4864             about. You might like I<Designing with Objects> especially if you enjoyed reading
4865             Harry Potter as a kid (or even as an adult, for that matter).
4866              
4867             If you send email regarding this module, please place the string "DecisionTree" in
4868             your subject line to get past my spam filter. Avi Kak's email address is
4869             C<kak@purdue.edu>
4870              
4871             =head1 COPYRIGHT
4872              
4873             This library is free software; you can redistribute it and/or modify it under the
4874             same terms as Perl itself.
4875              
4876             Copyright 2016 Avinash Kak
4877              
4878             =cut
4879