File Coverage

blib/lib/Algorithm/DecisionTree.pm
Criterion     Covered    Total       %
statement         780     2249    34.6
branch            201      708    28.3
condition          42      234    17.9
subroutine         44       92    47.8
pod                 8       40    20.0
total            1075     3323    32.3


line stmt bran cond sub pod time code
1             package Algorithm::DecisionTree;
2              
3             #--------------------------------------------------------------------------------------
4             # Copyright (c) 2016 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::DecisionTree is a Perl module for decision-tree based classification of
9             # multidimensional data.
10             # -------------------------------------------------------------------------------------
11              
12             #use 5.10.0;
13 1     1   12964 use strict;
  1         2  
  1         24  
14 1     1   3 use warnings;
  1         1  
  1         20  
15 1     1   4 use Carp;
  1         3  
  1         12093  
16              
17             our $VERSION = '3.42';
18              
19             ############################################ Constructor ##############################################
20             sub new {
21 1     1 1 330 my ($class, %args, $eval_or_boosting_mode);
22 1 50       12 if (@_ % 2 != 0) {
23 1         5 ($class, %args) = @_;
24             } else {
25 0         0 $class = shift;
26 0         0 $eval_or_boosting_mode = shift;
27 0 0 0     0 die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
28 0 0       0 die "Only one string arg allowed in eval and boosting modes" if @_;
29             }
30 1 50       3 unless ($eval_or_boosting_mode) {
31 1         3 my @params = keys %args;
32 1 50       3 croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
33             if check_for_illegal_params2(@params) == 0;
34             }
35             bless {
36             _training_datafile => $args{training_datafile},
37             _entropy_threshold => $args{entropy_threshold} || 0.01,
38             _max_depth_desired => exists $args{max_depth_desired} ?
39             $args{max_depth_desired} : undef,
40             _debug1 => $args{debug1} || 0,
41             _debug2 => $args{debug2} || 0,
42             _debug3 => $args{debug3} || 0,
43             _csv_class_column_index => $args{csv_class_column_index} || undef,
44             _csv_columns_for_features => $args{csv_columns_for_features} || undef,
45             _symbolic_to_numeric_cardinality_threshold
46             => $args{symbolic_to_numeric_cardinality_threshold} || 10,
47             _number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
48 1 50 50     53 _csv_cleanup_needed => $args{csv_cleanup_needed} || 0,
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
49             _training_data => [],
50             _root_node => undef,
51             _probability_cache => {},
52             _entropy_cache => {},
53             _training_data_hash => {},
54             _features_and_values_hash => {},
55             _samples_class_label_hash => {},
56             _class_names => [],
57             _class_priors => [],
58             _class_priors_hash => {},
59             _feature_names => [],
60             _numeric_features_valuerange_hash => {},
61             _sampling_points_for_numeric_feature_hash => {},
62             _feature_values_how_many_uniques_hash => {},
63             _prob_distribution_numeric_features_hash => {},
64             _histogram_delta_hash => {},
65             _num_of_histogram_bins_hash => {},
66             }, $class;
67              
68             }
69              
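A minimal sketch of a constructor call, using the keyword arguments assigned above; the file name, column indices, and parameter values are illustrative only:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;

    # Illustrative settings -- adapt the file name and CSV column indices to your data.
    my $dt = Algorithm::DecisionTree->new(
                  training_datafile                         => 'training_data.csv',
                  csv_class_column_index                    => 1,
                  csv_columns_for_features                  => [2,3,4,5],
                  entropy_threshold                         => 0.01,
                  max_depth_desired                         => 5,
                  symbolic_to_numeric_cardinality_threshold => 10,
                  csv_cleanup_needed                        => 1,
             );
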
70             #################################### Classify with Decision Tree #######################################
71              
72             ## Classifies one test sample at a time using the decision tree constructed from
73             ## your training file. The data record for the test sample must be supplied as
74             ## shown in the scripts in the `examples' subdirectory. See the scripts
75             ## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
76             sub classify {
77 0     0 1 0 my $self = shift;
78 0         0 my $root_node = shift;
79 0         0 my $feature_and_values = shift;
80 0         0 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
81 0         0 my @features_and_values = @$feature_and_values;
82 0         0 @features_and_values = @{deep_copy_array(\@features_and_values)};
  0         0  
83 0 0       0 die "\n\nError in the names you have used for features and/or values. " .
84             "Try using the csv_cleanup_needed option in the constructor call."
85             unless $self->check_names_used(\@features_and_values);
86 0         0 my @new_features_and_values = ();
87 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
88 0         0 foreach my $feature_and_value (@features_and_values) {
89 0         0 $feature_and_value =~ /$pattern/;
90 0         0 my ($feature, $value) = ($1, $2);
91 0         0 my $newvalue = $value;
92 0         0 my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
93 0         0 my $not_all_values_float = 0;
94 0 0       0 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  0         0  
95 0 0 0     0 if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
  0         0  
96             $not_all_values_float == 0) {
97 0         0 $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
98             }
99 0         0 push @new_features_and_values, "$feature" . '=' . "$newvalue";
100             }
101 0         0 @features_and_values = @new_features_and_values;
102 0 0       0 print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
103 0         0 my %answer = ();
104 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
105 0         0 $answer{$class_name} = undef;
106             }
107 0         0 $answer{'solution_path'} = [];
108 0         0 my %classification = %{$self->recursive_descent_for_classification($root_node,
  0         0  
109             \@features_and_values,\%answer)};
110 0         0 @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
  0         0  
  0         0  
111 0 0       0 if ($self->{_debug3}) {
112 0         0 print "\nCL2 The classification:\n";
113 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
114 0         0 print " $class_name with probability $classification{$class_name}\n";
115             }
116             }
117 0         0 my %classification_for_display = ();
118 0         0 foreach my $item (keys %classification) {
119 0 0       0 if ($item ne 'solution_path') {
120 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
121             } else {
122 0         0 my @outlist = ();
123 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
124 0         0 push @outlist, "NODE$x";
125             }
126 0         0 $classification_for_display{$item} = \@outlist;
127             }
128             }
129 0         0 return \%classification_for_display;
130             }
131              
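A minimal sketch of a classify() call, assuming the tree has already been constructed (see construct_decision_tree_classifier() further down) and that $dt and $root_node are in scope; the feature names and values are made up, and every entry must be in the "feature=value" form parsed above:

    # Hypothetical test sample; every entry must look like "feature=value".
    my @test_sample = qw/ grade=2.3 age=55.0 ploidy=diploid /;
    my $classification = $dt->classify($root_node, \@test_sample);
    foreach my $class_name (sort keys %$classification) {
        next if $class_name eq 'solution_path';
        print "$class_name  =>  $classification->{$class_name}\n";
    }
    print "Solution path: @{$classification->{'solution_path'}}\n";
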
132             sub recursive_descent_for_classification {
133 0     0 0 0 my $self = shift;
134 0         0 my $node = shift;
135 0         0 my $features_and_values = shift;
136 0         0 my $answer = shift;
137 0         0 my @features_and_values = @$features_and_values;
138 0         0 my %answer = %$answer;
139 0         0 my @children = @{$node->get_children()};
  0         0  
140 0 0       0 if (@children == 0) {
141 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
142 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
143 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
144             }
145 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
146 0         0 return \%answer;
147             }
148 0         0 my $feature_tested_at_node = $node->get_feature();
149             print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
150 0 0       0 if $self->{_debug3};
151 0         0 my $value_for_feature;
152             my $path_found;
153 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
154 0         0 foreach my $feature_and_value (@features_and_values) {
155 0         0 $feature_and_value =~ /$pattern/;
156 0 0       0 $value_for_feature = $2 if $feature_tested_at_node eq $1;
157             }
158             # The following clause was introduced in Version 3.20
159 0 0       0 if (!defined $value_for_feature) {
160 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
161 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
162 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
163             }
164 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
165 0         0 return \%answer;
166             }
167 0 0       0 if ($value_for_feature) {
168 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
169 0 0       0 print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
170 0         0 my $pattern1 = '(.+)<(.+)';
171 0         0 my $pattern2 = '(.+)>(.+)';
172 0         0 foreach my $child (@children) {
173 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
174 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
175 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern1/) {
176 0         0 my ($feature, $threshold) = ($1,$2);
177 0 0       0 if ($value_for_feature <= $threshold) {
178 0         0 $path_found = 1;
179 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
180             $features_and_values,\%answer)};
181 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
182 0         0 last;
183             }
184             }
185 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern2/) {
186 0         0 my ($feature, $threshold) = ($1,$2);
187 0 0       0 if ($value_for_feature > $threshold) {
188 0         0 $path_found = 1;
189 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
190             $features_and_values,\%answer)};
191 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
192 0         0 last;
193             }
194             }
195             }
196 0 0       0 return \%answer if $path_found;
197             } else {
198 0         0 my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
199             print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
200 0 0       0 if $self->{_debug3};
201 0         0 foreach my $child (@children) {
202 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
203 0 0       0 print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
204 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
205 0 0       0 if ($last_feature_and_value_on_branch eq $feature_value_combo) {
206 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
207             $features_and_values,\%answer)};
208 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
209 0         0 $path_found = 1;
210 0         0 last;
211             }
212             }
213 0 0       0 return \%answer if $path_found;
214             }
215             }
216 0 0       0 if (! $path_found) {
217 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
218 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
219 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
220             }
221 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
222             }
223 0         0 return \%answer;
224             }
225              
226             ## If you want classification to be carried out by engaging a human user in a
227             ## question-answer session, this is the method to use for that purpose. See, for
228             ## example, the script classify_by_asking_questions.pl in the `examples'
229             ## subdirectory for an illustration of how to do that.
230             sub classify_by_asking_questions {
231 0     0 1 0 my $self = shift;
232 0         0 my $root_node = shift;
233 0         0 my %answer = ();
234 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
235 0         0 $answer{$class_name} = undef;
236             }
237 0         0 $answer{'solution_path'} = [];
238 0         0 my %scratchpad_for_numeric_answers = ();
239 0         0 foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
  0         0  
240 0         0 $scratchpad_for_numeric_answers{$feature_name} = undef;
241             }
242 0         0 my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
  0         0  
243             \%answer, \%scratchpad_for_numeric_answers)};
244 0         0 @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
  0         0  
  0         0  
245 0         0 my %classification_for_display = ();
246 0         0 foreach my $item (keys %classification) {
247 0 0       0 if ($item ne 'solution_path') {
248 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
249             } else {
250 0         0 my @outlist = ();
251 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
252 0         0 push @outlist, "NODE$x";
253             }
254 0         0 $classification_for_display{$item} = \@outlist;
255             }
256             }
257 0         0 return \%classification_for_display;
258             }
259              
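A minimal sketch of interactive classification, assuming $dt and $root_node were set up as in the earlier examples; the method prompts on the terminal for one feature value at a time:

    my $classification = $dt->classify_by_asking_questions($root_node);
    foreach my $class_name (sort keys %$classification) {
        next if $class_name eq 'solution_path';
        print "$class_name  =>  $classification->{$class_name}\n";
    }
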
260             sub interactive_recursive_descent_for_classification {
261 0     0 0 0 my $self = shift;
262 0         0 my $node = shift;
263 0         0 my $answer = shift;
264 0         0 my $scratchpad_for_numerics = shift;
265 0         0 my %answer = %$answer;
266 0         0 my %scratchpad_for_numerics = %$scratchpad_for_numerics;
267 0         0 my $pattern1 = '(.+)<(.+)';
268 0         0 my $pattern2 = '(.+)>(.+)';
269 0         0 my $user_value_for_feature;
270 0         0 my @children = @{$node->get_children()};
  0         0  
271 0 0       0 if (@children == 0) {
272 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
273 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
274 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
275             }
276 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
277 0         0 return \%answer;
278             }
279 0         0 my @list_of_branch_attributes_to_children = ();
280 0         0 foreach my $child (@children) {
281 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
282 0         0 my $feature_and_value_on_branch = $branch_features_and_values[-1];
283 0         0 push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
284             }
285 0         0 my $feature_tested_at_node = $node->get_feature();
286 0         0 my $feature_value_combo;
287 0         0 my $path_found = 0;
288 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
289 0 0       0 if ($scratchpad_for_numerics{$feature_tested_at_node}) {
290 0         0 $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
291             } else {
292 0         0 my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
  0         0  
293 0         0 while (1) {
294 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
295 0         0 print "\nEnter a value in the range (@valuerange): ";
296 0         0 $user_value_for_feature = <STDIN>;
297 0         0 $user_value_for_feature =~ s/\r?\n?$//;
298 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
299 0         0 my $answer_found = 0;
300 0 0 0     0 if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
301 0         0 $answer_found = 1;
302 0         0 last;
303             }
304 0 0       0 last if $answer_found;
305 0         0 print("You entered an illegal value. Let's try again");
306             }
307 0         0 $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
308             }
309 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
310 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
311 0 0       0 if ($branch_attribute =~ /$pattern1/) {
312 0         0 my ($feature,$threshold) = ($1,$2);
313 0 0       0 if ($user_value_for_feature <= $threshold) {
314 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
315             \%answer, \%scratchpad_for_numerics)};
316 0         0 $path_found = 1;
317 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
318 0         0 last;
319             }
320             }
321 0 0       0 if ($branch_attribute =~ /$pattern2/) {
322 0         0 my ($feature,$threshold) = ($1,$2);
323 0 0       0 if ($user_value_for_feature > $threshold) {
324 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
325             \%answer, \%scratchpad_for_numerics)};
326 0         0 $path_found = 1;
327 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
328 0         0 last;
329             }
330             }
331             }
332 0 0       0 return \%answer if $path_found;
333             } else {
334 0         0 my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
  0         0  
335 0         0 while (1) {
336 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
337 0         0 print "\nEnter a value from the list (@possible_values_for_feature): ";
338 0         0 $user_value_for_feature = <STDIN>;
339 0         0 $user_value_for_feature =~ s/\r?\n?$//;
340 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
341 0         0 my $answer_found = 0;
342 0 0       0 if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
343 0         0 $answer_found = 1;
344 0         0 last;
345             }
346 0 0       0 last if $answer_found;
347 0         0 print("You entered an illegal value. Let's try again");
348             }
349 0         0 $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
350 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
351 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
352 0 0       0 if ($branch_attribute eq $feature_value_combo) {
353 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
354             \%answer, \%scratchpad_for_numerics)};
355 0         0 $path_found = 1;
356 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
357 0         0 last;
358             }
359             }
360 0 0       0 return \%answer if $path_found;
361             }
362 0 0       0 if (! $path_found) {
363 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
364 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
365 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
366             }
367 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
368             }
369 0         0 return \%answer;
370             }
371              
372             ###################################### Decision Tree Construction ####################################
373              
374             ## At the root node, we find the best feature that yields the greatest reduction in
375             ## class entropy from the entropy based on just the class priors. The logic for
376             ## finding this feature is different for symbolic features and for numeric features.
377             ## That logic is built into the method shown later for best feature calculations.
378             sub construct_decision_tree_classifier {
379 1     1 1 8 print "\nConstructing the decision tree ...\n";
380 1         2 my $self = shift;
381 1 50       3 if ($self->{_debug3}) {
382 0         0 $self->determine_data_condition();
383 0         0 print "\nStarting construction of the decision tree:\n";
384             }
385 1         2 my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
  2         5  
  1         2  
386 1 50       4 if ($self->{_debug3}) {
387 0         0 print "\nPrior class probabilities: @class_probabilities\n";
388 0         0 print "\nClass names: @{$self->{_class_names}}\n";
  0         0  
389             }
390 1         4 my $entropy = $self->class_entropy_on_priors();
391 1 50       3 print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
392 1         11 my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
393 1         1 $root_node->set_class_names(\@{$self->{_class_names}});
  1         4  
394 1         1 $self->{_root_node} = $root_node;
395 1         3 $self->recursive_descent($root_node);
396 1         5 return $root_node;
397             }
398              
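A sketch of the usual call sequence leading up to tree construction. The two probability methods appear later in this file; get_training_data() is assumed to be the method, defined elsewhere in this module, that reads and parses the training file:

    $dt->get_training_data();                         # assumed: reads and parses the training file
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
    my $root_node = $dt->construct_decision_tree_classifier();
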
399             ## After the root node of the decision tree is calculated by the previous methods,
400             ## we invoke this method recursively to create the rest of the tree. At each node,
401             ## we find the feature that achieves the largest entropy reduction with regard to
402             ## the partitioning of the training data samples that correspond to that node.
403             sub recursive_descent {
404 26     26 0 23 my $self = shift;
405 26         13 my $node = shift;
406             print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
407 26 50       44 if $self->{_debug3};
408 26         31 my $node_serial_number = $node->get_serial_num();
409 26         18 my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
  26         29  
410 26         27 my $existing_node_entropy = $node->get_node_entropy();
411 26 50       38 if ($self->{_debug3}) {
412 0         0 print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
413 0         0 print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
414 0         0 print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
415 0         0 my @class_probs = @{$node->get_class_probabilities()};
  0         0  
416 0         0 print "\nRD4 Class probabilities: @class_probs\n";
417             }
418 26 100       36 if ($existing_node_entropy < $self->{_entropy_threshold}) {
419 8 50       12 print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
420 8         23 return;
421             }
422 18         16 my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  18         22  
423 18         35 my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
424             $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
425 18         30 $node->set_feature($best_feature);
426 18 50       27 $node->display_node() if $self->{_debug3};
427 18 50 33     28 if (defined($self->{_max_depth_desired}) &&
428             (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
429 0 0       0 print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
430 0         0 return;
431             }
432 18 50       27 return if ! defined $best_feature;
433 18 50       21 if ($self->{_debug3}) {
434 0         0 print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
435 0         0 print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
436 0         0 print "\nRD9 Best feature entropy: $best_feature_entropy\n";
437 0         0 print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
438             }
439 18         19 my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
440 18 50       22 print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
441 18 100       25 if ($entropy_gain > $self->{_entropy_threshold}) {
442 12 50 33     23 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
443             $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
444             $self->{_symbolic_to_numeric_cardinality_threshold}) {
445 0         0 my $best_threshold = $decision_val; # as returned by best feature calculator
446 0         0 my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
447             my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
448 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
449             my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
450 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
451 0         0 my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
452 0         0 my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
453 0         0 push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
454             $feature_threshold_combo_for_less_than;
455 0         0 push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
456             $feature_threshold_combo_for_greater_than;
457 0 0       0 if ($self->{_debug3}) {
458 0         0 print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
459             "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
460 0         0 print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
461             "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
462             }
463             my @class_probabilities_for_lessthan_child_node =
464 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
465 0         0 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
  0         0  
466             my @class_probabilities_for_greaterthan_child_node =
467 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
468 0         0 \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
  0         0  
469 0 0       0 if ($self->{_debug3}) {
470 0         0 print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
471 0         0 print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
472             }
473 0 0       0 if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
474 0         0 my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
475             \@class_probabilities_for_lessthan_child_node,
476             \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
477 0         0 $node->add_child_link($left_child_node);
478 0         0 $self->recursive_descent($left_child_node);
479             }
480 0 0       0 if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
481 0         0 my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
482             \@class_probabilities_for_greaterthan_child_node,
483             \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
484 0         0 $node->add_child_link($right_child_node);
485 0         0 $self->recursive_descent($right_child_node);
486             }
487             } else {
488             print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
489 12 50       15 if $self->{_debug3};
490 12         8 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
  12         22  
491 12 50       19 print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
492 12         29 my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
  40         75  
493 12         13 my @class_entropies_for_children = ();
494 12         23 foreach my $feature_and_value_index (0..@feature_value_combos-1) {
495             print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
496 40 50       50 if $self->{_debug3};
497 40         32 my @extended_branch_features_and_values_or_thresholds;
498 40 100       45 if (! @features_and_values_or_thresholds_on_branch) {
499 4         7 @extended_branch_features_and_values_or_thresholds =
500             ($feature_value_combos[$feature_and_value_index]);
501             } else {
502             @extended_branch_features_and_values_or_thresholds =
503 36         27 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  36         38  
504 36         41 push @extended_branch_features_and_values_or_thresholds,
505             $feature_value_combos[$feature_and_value_index];
506             }
507             my @class_probabilities =
508 80         89 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
509 40         31 \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
  40         42  
510 40         48 my $class_entropy_for_child =
511             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
512             \@extended_branch_features_and_values_or_thresholds);
513 40 50       56 if ($self->{_debug3}) {
514 0         0 print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
515 0         0 print "\nRD20 class entropy for child: $class_entropy_for_child\n";
516             }
517 40 100       51 if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
518 25         42 my $child_node = DTNode->new(undef, $class_entropy_for_child,
519             \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
520 25         38 $node->add_child_link($child_node);
521 25         78 $self->recursive_descent($child_node);
522             } else {
523 15 50       34 print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
524             }
525             }
526             }
527             } else {
528             print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
529 6 50       10 if $self->{_debug3};
530 6         22 return;
531             }
532             }
533              
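To make the stopping logic above concrete: a child node is created only when the drop from the node's existing entropy to the best feature's entropy exceeds the _entropy_threshold set in the constructor. A small worked example with made-up numbers:

    # Made-up numbers for illustration:
    my $existing_node_entropy = 0.97;    # entropy at the current node
    my $best_feature_entropy  = 0.62;    # weighted entropy after splitting on the best feature
    my $entropy_threshold     = 0.01;    # constructor default
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;    # 0.35
    print "keep splitting\n" if $entropy_gain > $entropy_threshold;       # 0.35 > 0.01, so recurse
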
534             ## This is the heart of the decision tree constructor. Its main job is to figure
535             ## out the best feature to use for partitioning the training data samples that
536             ## correspond to the current node. The search for the best feature is carried out
537             ## differently for symbolic features and for numeric features. For a symbolic
538             ## feature, the method estimates the entropy for each value of the feature and then
539             ## averages out these entropies as a measure of the discriminatory power of that
540             ## feature. For a numeric feature, on the other hand, it estimates the entropy
541             ## reduction that can be achieved if we were to partition the set of training samples
542             ## for each possible threshold. For a numeric feature, all possible sampling points
543             ## relevant to the node in question are considered as candidates for thresholds.
544             sub best_feature_calculator {
545 18     18 0 18 my $self = shift;
546 18         8 my $features_and_values_or_thresholds_on_branch = shift;
547 18         13 my $existing_node_entropy = shift;
548 18         19 my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
549 18         17 my $pattern1 = '(.+)=(.+)';
550 18         9 my $pattern2 = '(.+)<(.+)';
551 18         13 my $pattern3 = '(.+)>(.+)';
552 18         17 my @all_symbolic_features = ();
553 18         13 foreach my $feature_name (@{$self->{_feature_names}}) {
  18         25  
554             push @all_symbolic_features, $feature_name
555 72 50       111 if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
556             }
557 18         14 my @symbolic_features_already_used = ();
558 18         18 foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
559 47 50       197 push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
560             }
561 18         14 my @symbolic_features_not_yet_used;
562 18         14 foreach my $x (@all_symbolic_features) {
563 72 100       65 push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
564             }
565 18         15 my @true_numeric_types = ();
566 18         74 my @symbolic_types = ();
567 18         13 my @true_numeric_types_feature_names = ();
568 18         11 my @symbolic_types_feature_names = ();
569 18         17 foreach my $item (@features_and_values_or_thresholds_on_branch) {
570 47 50       146 if ($item =~ /$pattern2/) {
    50          
    50          
571 0         0 push @true_numeric_types, $item;
572 0         0 push @true_numeric_types_feature_names, $1;
573             } elsif ($item =~ /$pattern3/) {
574 0         0 push @true_numeric_types, $item;
575 0         0 push @true_numeric_types_feature_names, $1;
576             } elsif ($item =~ /$pattern1/) {
577 47         36 push @symbolic_types, $item;
578 47         66 push @symbolic_types_feature_names, $1;
579             } else {
580 0         0 die "format error in the representation of feature and values or thresholds";
581             }
582             }
583 18         24 my %seen = ();
584 18 0       15 @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
  0         0  
585 18         27 %seen = ();
586 18 50       18 @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
  47         104  
587             my @bounded_intervals_numeric_types =
588 18         11 @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
  18         33  
589             # Calculate the upper and the lower bounds to be used when searching for the best
590             # threshold for each of the numeric features that are in play at the current node:
591 18         13 my (%upperbound, %lowerbound);
592 18         17 foreach my $feature (@true_numeric_types_feature_names) {
593 0         0 $upperbound{$feature} = undef;
594 0         0 $lowerbound{$feature} = undef;
595             }
596 18         17 foreach my $item (@bounded_intervals_numeric_types) {
597 0         0 foreach my $feature_grouping (@$item) {
598 0 0       0 if ($feature_grouping->[1] eq '>') {
599 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
600             } else {
601 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
602             }
603             }
604             }
605 18         15 my %entropy_values_for_different_features = ();
606 18         11 my %partitioning_point_child_entropies_hash = ();
607 18         15 my %partitioning_point_threshold = ();
608 18         16 my %entropies_for_different_values_of_symbolic_feature = ();
609 18         18 foreach my $feature (@{$self->{_feature_names}}) {
  18         25  
610 72         73 $entropy_values_for_different_features{$feature} = [];
611 72         59 $partitioning_point_child_entropies_hash{$feature} = {};
612 72         54 $partitioning_point_threshold{$feature} = undef;
613 72         74 $entropies_for_different_values_of_symbolic_feature{$feature} = [];
614             }
615 18         13 foreach my $i (0..@{$self->{_feature_names}}-1) {
  18         51  
616 72         62 my $feature_name = $self->{_feature_names}->[$i];
617 72 50       82 print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
618 72 100 33     82 if (contained_in($feature_name, @symbolic_features_already_used)) {
    50          
619 47         45 next;
620 25         44 } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
621             $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
622             $self->{_symbolic_to_numeric_cardinality_threshold}) {
623 0         0 my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
624 0 0       0 print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
625 0         0 my @newvalues = ();
626 0 0       0 if (contained_in($feature_name, @true_numeric_types_feature_names)) {
627 0 0 0     0 if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
628             $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
629 0         0 next;
630             } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
631             $lowerbound{$feature_name} < $upperbound{$feature_name}) {
632 0         0 foreach my $x (@values) {
633 0 0 0     0 push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
634             }
635             } elsif (defined($upperbound{$feature_name})) {
636 0         0 foreach my $x (@values) {
637 0 0       0 push @newvalues, $x if $x <= $upperbound{$feature_name};
638             }
639             } elsif (defined($lowerbound{$feature_name})) {
640 0         0 foreach my $x (@values) {
641 0 0       0 push @newvalues, $x if $x > $lowerbound{$feature_name};
642             }
643             } else {
644 0         0 die "Error in bound specifications in best feature calculator";
645             }
646             } else {
647 0         0 @newvalues = @{deep_copy_array(\@values)};
  0         0  
648             }
649 0 0       0 next if @newvalues == 0;
650 0         0 my @partitioning_entropies = ();
651 0         0 foreach my $value (@newvalues) {
652 0         0 my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
653 0         0 my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
654 0         0 my @for_left_child;
655             my @for_right_child;
656 0 0       0 if (@features_and_values_or_thresholds_on_branch) {
657 0         0 @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
658 0         0 push @for_left_child, $feature_and_less_than_value_string;
659 0         0 @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
660 0         0 push @for_right_child, $feature_and_greater_than_value_string;
661             } else {
662 0         0 @for_left_child = ($feature_and_less_than_value_string);
663 0         0 @for_right_child = ($feature_and_greater_than_value_string);
664             }
665 0         0 my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
666             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667 0         0 my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
668             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
669 0         0 my $partitioning_entropy = $entropy1 *
670             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
671             $entropy2 *
672             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
673              
674 0         0 push @partitioning_entropies, $partitioning_entropy;
675 0         0 $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
676             }
677 0         0 my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
678 0 0       0 if ($min_entropy < $existing_node_entropy) {
679 0         0 $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
680 0         0 $entropy_values_for_different_features{$feature_name} = $min_entropy;
681             }
682             } else {
683 25 50       29 print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
684 25 50       30 print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
685 25         20 my %seen;
686 82 50       262 my @values = grep {$_ ne 'NA' && !$seen{$_}++}
687 25         16 @{$self->{_features_and_unique_values_hash}->{$feature_name}};
  25         35  
688 25         47 @values = sort @values;
689 25 50       31 print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
690              
691 25         19 my $entropy = 0;
692 25         25 foreach my $value (@values) {
693 82         127 my $feature_value_string = "$feature_name" . '=' . "$value";
694 82 50       98 print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
695 82         59 my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  82         96  
696 82 100       117 if (@features_and_values_or_thresholds_on_branch) {
697 68         63 push @extended_attributes, $feature_value_string;
698             } else {
699 14         14 @extended_attributes = ($feature_value_string);
700             }
701 82         100 $entropy +=
702             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
703             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
704             print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
705 82 50       118 "is $entropy\n" if $self->{_debug3};
706 82         62 push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
  82         157  
707             }
708 25 50       41 if ($entropy < $existing_node_entropy) {
709 25         64 $entropy_values_for_different_features{$feature_name} = $entropy;
710             }
711             }
712             }
713 18         16 my $min_entropy_for_best_feature;
714             my $best_feature_name;
715 18         30 foreach my $feature_nom (keys %entropy_values_for_different_features) {
716 72 100       73 if (!defined($best_feature_name)) {
717 18         14 $best_feature_name = $feature_nom;
718 18         20 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
719             } else {
720 54 100       86 if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
721 13         11 $best_feature_name = $feature_nom;
722 13         13 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
723             }
724             }
725             }
726 18         15 my $threshold_for_best_feature;
727 18 50       23 if (exists $partitioning_point_threshold{$best_feature_name}) {
728 18         16 $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
729             } else {
730 0         0 $threshold_for_best_feature = undef;
731             }
732 18         10 my $best_feature_entropy = $min_entropy_for_best_feature;
733 18         15 my @val_based_entropies_to_be_returned;
734             my $decision_val_to_be_returned;
735 18 50 33     36 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
736             $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
737             $self->{_symbolic_to_numeric_cardinality_threshold}) {
738             @val_based_entropies_to_be_returned =
739 0         0 @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
  0         0  
740             } else {
741 18         15 @val_based_entropies_to_be_returned = ();
742             }
743 18 50       26 if (exists $partitioning_point_threshold{$best_feature_name}) {
744 18         15 $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
745             } else {
746 0         0 $decision_val_to_be_returned = undef;
747             }
748             print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
749 18 50       25 "@val_based_entropies_to_be_returned\n" if $self->{_debug3};
750 18         102 return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
751             $decision_val_to_be_returned);
752             }
753              
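For a symbolic feature, the loop above accumulates a weighted entropy: the class entropy of the branch extended with each "feature=value" combination, weighted by the probability of that extended branch. A worked example with made-up numbers for a two-valued feature:

    # P(branch extended with f=a) = 0.6, class entropy on that branch = 0.4 bits
    # P(branch extended with f=b) = 0.4, class entropy on that branch = 0.9 bits
    my $weighted_entropy = 0.6 * 0.4  +  0.4 * 0.9;    # 0.60 bits
    # The feature with the smallest such weighted entropy wins the comparison loop above.
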
754             ######################################### Entropy Calculators #####################################
755              
756             sub class_entropy_on_priors {
757 1     1 0 2 my $self = shift;
758             return $self->{_entropy_cache}->{'priors'}
759 1 50       4 if exists $self->{_entropy_cache}->{"priors"};
760 1         1 my @class_names = @{$self->{_class_names}};
  1         2  
761 1         2 my $entropy;
762 1         1 foreach my $class (@class_names) {
763 2         4 my $prob = $self->prior_probability_for_class($class);
764 2 50 33     18 my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
765 2 50       3 $log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
766 2 50       4 $log_prob = 0 if $prob > 0.999; # since log(1) = 0
767 2 100       5 if (!defined $entropy) {
768 1         2 $entropy = -1.0 * $prob * $log_prob;
769 1         2 next;
770             }
771 1         3 $entropy += -1.0 * $prob * $log_prob;
772             }
773 1         2 $self->{_entropy_cache}->{'priors'} = $entropy;
774 1         2 return $entropy;
775             }
776              
777             sub entropy_scanner_for_a_numeric_feature {
778 0     0 0 0 local $| = 1;
779 0         0 my $self = shift;
780 0         0 my $feature = shift;
781 0         0 my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
  0         0  
782 0         0 my @entropies_for_less_than_thresholds = ();
783 0         0 my @entropies_for_greater_than_thresholds = ();
784 0         0 foreach my $point (@all_sampling_points) {
785 0         0 print ". ";
786 0         0 push @entropies_for_less_than_thresholds,
787             $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
788 0         0 push @entropies_for_greater_than_thresholds,
789             $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
790             }
791 0         0 print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
792             "@entropies_for_less_than_thresholds\n";
793 0         0 print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
794             "@entropies_for_greater_than_thresholds\n";
795             }
796              
797             sub class_entropy_for_less_than_threshold_for_feature {
798 0     0 0 0 my $self = shift;
799 0         0 my $arr = shift;
800 0         0 my $feature = shift;
801 0         0 my $threshold = shift;
802 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
803 0         0 my $feature_threshold_combo = "$feature" . '<' . "$threshold";
804 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
805 0         0 $sequence .= ":" . $feature_threshold_combo;
806 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
807             my @copy_of_array_of_features_and_values_or_thresholds =
808 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
809 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
810 0         0 my $entropy = 0;
811 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
812 0         0 my $log_prob = undef;
813 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
814             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
815 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
816 0         0 $log_prob = log($prob) / log(2.0);
817             } elsif ($prob < 0.0001) {
818 0         0 $log_prob = 0;
819             } elsif ($prob > 0.999) {
820 0         0 $log_prob = 0;
821             } else {
822 0         0 die "An error has occurred in log_prob calculation";
823             }
824 0         0 $entropy += -1.0 * $prob * $log_prob;
825             }
826 0 0       0 if (abs($entropy) < 0.0000001) {
827 0         0 $entropy = 0.0;
828             }
829 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
830 0         0 return $entropy;
831             }
832              
833             sub class_entropy_for_greater_than_threshold_for_feature {
834 0     0 0 0 my $self = shift;
835 0         0 my $arr = shift;
836 0         0 my $feature = shift;
837 0         0 my $threshold = shift;
838 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
839 0         0 my $feature_threshold_combo = "$feature" . '>' . "$threshold";
840 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
841 0         0 $sequence .= ":" . $feature_threshold_combo;
842 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
843             my @copy_of_array_of_features_and_values_or_thresholds =
844 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
845 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
846 0         0 my $entropy = 0;
847 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
848 0         0 my $log_prob = undef;
849 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
850             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
851 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
852 0         0 $log_prob = log($prob) / log(2.0);
853             } elsif ($prob < 0.0001) {
854 0         0 $log_prob = 0;
855             } elsif ($prob > 0.999) {
856 0         0 $log_prob = 0;
857             } else {
858 0         0 die "An error has occurred in log_prob calculation";
859             }
860 0         0 $entropy += -1.0 * $prob * $log_prob;
861             }
862 0 0       0 if (abs($entropy) < 0.0000001) {
863 0         0 $entropy = 0.0;
864             }
865 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
866 0         0 return $entropy;
867             }
868              
869             sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
870 122     122 0 89 my $self = shift;
871 122         69 my $array_of_features_and_values_or_thresholds = shift;
872 122         124 my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
873 122         122 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
874 122 100       229 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
875 82         49 my $entropy = 0;
876 82         49 foreach my $class_name (@{$self->{_class_names}}) {
  82         89  
877 164         124 my $log_prob = undef;
878 164         192 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
879             $class_name, \@array_of_features_and_values_or_thresholds);
880 164 100 100     438 if ($prob >= 0.0001 && $prob <= 0.999) {
    100          
    50          
881 138         149 $log_prob = log($prob) / log(2.0);
882             } elsif ($prob < 0.0001) {
883 13         10 $log_prob = 0;
884             } elsif ($prob > 0.999) {
885 13         12 $log_prob = 0;
886             } else {
887 0         0 die "An error has occurred in log_prob calculation";
888             }
889 164         184 $entropy += -1.0 * $prob * $log_prob;
890             }
891 82 100       116 if (abs($entropy) < 0.0000001) {
892 13         11 $entropy = 0.0;
893             }
894 82         109 $self->{_entropy_cache}->{$sequence} = $entropy;
895 82         132 return $entropy;
896             }
897              
898              
899             ##################################### Probability Calculators ########################################
900              
901             sub prior_probability_for_class {
902 4     4 0 5 my $self = shift;
903 4         3 my $class = shift;
904 4         3 my $class_name_in_cache = "prior" . '::' . $class;
905             return $self->{_probability_cache}->{$class_name_in_cache}
906 4 50       16 if exists $self->{_probability_cache}->{$class_name_in_cache};
907 0         0 my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
  0         0  
908 0         0 my @values = values %{$self->{_samples_class_label_hash}};
  0         0  
909 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
910 0         0 my @trues = grep {$_ eq $class_name} @values;
  0         0  
911 0         0 my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
912 0         0 my $this_class_name_in_cache = "prior" . '::' . $class_name;
913 0         0 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
914             }
915 0         0 return $self->{_probability_cache}->{$class_name_in_cache};
916             }
917              
918             sub calculate_class_priors {
919 1     1 1 7 my $self = shift;
920 1 50       2 return if scalar keys %{$self->{_class_priors_hash}} > 1;
  1         4  
921 1         2 foreach my $class_name (@{$self->{_class_names}}) {
  1         2  
922 2         4 my $class_name_in_cache = "prior::$class_name";
923 2         1 my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
  2         5  
924 2         1 my @all_values = values %{$self->{_samples_class_label_hash}};
  2         11  
925 2         3 my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
  70         53  
926 2         5 my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
927 2         3 $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
928 2         4 my $this_class_name_in_cache = "prior::$class_name";
929 2         6 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
930             }
931 1 50       4 if ($self->{_debug1}) {
932 0         0 foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
  0         0  
933 0         0 print "$class => $self->{_class_priors_hash}->{$class}\n";
934             }
935             }
936             }
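## The prior computed above is simply the fraction of the training samples that carry
## a given class label.  Purely as an illustration (the numbers are hypothetical): if
## 14 of 35 training samples carry the label "class=benign", then
##
##     $self->{_class_priors_hash}->{'class=benign'}  =  14 / 35  =  0.4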
937              
938             sub calculate_first_order_probabilities {
939 1     1 1 13 print "\nEstimating probabilities...\n";
940 1         2 my $self = shift;
941 1         1 foreach my $feature (@{$self->{_feature_names}}) {
  1         3  
942 4         13 $self->probability_of_feature_value($feature, undef);
943 4 50       16 if ($self->{_debug2}) {
944 0 0       0 if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
945 0         0 print "\nPresenting probability distribution for a numeric feature:\n";
946 0         0 foreach my $sampling_point (sort {$a <=> $b} keys
  0         0  
947 0         0 %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
948 0         0 my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
949             print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
950 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
951             }
952             } else {
953 0         0 print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
954 0         0 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
955 0         0 foreach my $value (sort @values_for_feature) {
956 0         0 my $prob = $self->probability_of_feature_value($feature,$value);
957 0         0 print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
958             }
959             }
960             }
961             }
962             }
963              
964             sub probability_of_feature_value {
965 190     190 0 133 my $self = shift;
966 190         142 my $feature_name = shift;
967 190         131 my $value = shift;
968 190 50 66     568 $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
969 190 50 66     373 if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
970             $value = closest_sampling_point($value,
971 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
972             }
973 190         112 my $feature_and_value;
974 190 100       228 if (defined($value)) {
975 186         196 $feature_and_value = "$feature_name=$value";
976             }
977 190 50 66     358 if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
978 186         329 return $self->{_probability_cache}->{$feature_and_value};
979             }
980 4         7 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
981 4 50       10 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
982 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
983             $self->{_symbolic_to_numeric_cardinality_threshold}) {
984 0 0       0 if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
985 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
986 0         0 $diffrange = $valuerange[1] - $valuerange[0];
987 0         0 my %seen = ();
988 0 0 0     0 my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
  0         0  
989 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
990 0         0 my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
  0         0  
  0         0  
991             $unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
992 0         0 my $median_diff = $diffs[int(@diffs/2) - 1];
993 0         0 $histogram_delta = $median_diff * 2;
994 0 0       0 if ($histogram_delta < $diffrange / 500.0) {
995 0 0       0 if (defined $self->{_number_of_histogram_bins}) {
996 0         0 $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
997             } else {
998 0         0 $histogram_delta = $diffrange / 500.0;
999             }
1000             }
1001 0         0 $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1002 0         0 $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1003 0         0 $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1004 0         0 my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
  0         0  
1005             0..$num_of_histogram_bins-1;
1006 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1007             \@sampling_points_for_feature;
1008             }
1009             }
1010             }
1011 4 50       9 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1012 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1013             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1014             my @sampling_points_for_feature =
1015 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1016 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1017 0         0 my @actual_values_for_feature = grep {$_ ne 'NA'}
1018 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1019 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1020 0         0 foreach my $j (0..@actual_values_for_feature-1) {
1021 0 0       0 if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1022 0         0 $counts_at_sampling_points[$i]++
1023             }
1024             }
1025             }
1026 0         0 my $total_counts = 0;
1027 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1028 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1029 0         0 my %bin_prob_hash = ();
1030 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1031 0         0 $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1032             }
1033 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1034 0         0 my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
  0         0  
1035             @sampling_points_for_feature;
1036 0         0 foreach my $i (0..@values_for_feature-1) {
1037 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1038             }
1039 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1040 0         0 return $self->{_probability_cache}->{$feature_and_value};
1041             } else {
1042 0         0 return 0;
1043             }
1044             } else {
1045 0         0 my %seen = ();
1046 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1047 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1048 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1049 0         0 my @value_counts = (0) x @values_for_feature;
1050             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1051 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
  0         0  
  0         0  
1052 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1053 0         0 foreach my $i (0..@values_for_feature-1) {
1054 0         0 foreach my $current_value (@features_and_values) {
1055 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1056             }
1057             }
1058             }
1059 0         0 my $total_counts = 0;
1060 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1061 0 0       0 die "PFV Something is wrong with your training file. It contains no training samples \
1062             for feature named $feature_name" if $total_counts == 0;
1063 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
  0         0  
1064 0         0 foreach my $i (0..@values_for_feature-1) {
1065 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1066             }
1067 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1068 0         0 return $self->{_probability_cache}->{$feature_and_value};
1069             } else {
1070 0         0 return 0;
1071             }
1072             }
1073             } else {
1074             # This section is only for purely symbolic features:
1075 4         6 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  4         26  
1076 4         7 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  140         157  
1077 4         17 my @value_counts = (0) x @values_for_feature;
1078             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1079 4         5 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
  548         520  
  4         50  
1080 140         77 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         232  
1081 140         148 foreach my $i (0..@values_for_feature-1) {
1082 4900         3357 for my $current_value (@features_and_values) {
1083 19600 100       23181 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1084             }
1085             }
1086             }
1087 4         22 foreach my $i (0..@values_for_feature-1) {
1088             $self->{_probability_cache}->{$values_for_feature[$i]} =
1089 140         95 $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
  140         214  
1090             }
1091 4 50 33     16 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1092 0         0 return $self->{_probability_cache}->{$feature_and_value};
1093             } else {
1094 4         20 return 0;
1095             }
1096             }
1097             }
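## For a numeric feature whose number of unique values exceeds the
## symbolic_to_numeric_cardinality_threshold, the code above lays out the histogram
## as follows: the bin width (histogram_delta) is twice the median gap between
## consecutive sorted unique values; if that comes out smaller than diffrange/500 it
## is replaced by diffrange/number_of_histogram_bins when that option was supplied,
## or by diffrange/500 otherwise.  The number of bins is int(diffrange/delta) + 1,
## and the sampling points start at the low end of the value range and advance in
## steps of histogram_delta.  An illustrative, hypothetical case:
##
##     unique values 1, 2, 3, ..., 50   =>  value range [1, 50], diffrange = 49
##     median gap = 1                   =>  histogram_delta = 2
##     number of bins = int(49/2) + 1 = 25
##     sampling points = 1, 3, 5, ..., 49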
1098              
1099             sub probability_of_feature_value_given_class {
1100 380     380 0 266 my $self = shift;
1101 380         261 my $feature_name = shift;
1102 380         266 my $feature_value = shift;
1103 380         225 my $class_name = shift;
1104 380 50 33     1091 $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1105 380 50 33     678 if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1106             $feature_value = closest_sampling_point($feature_value,
1107 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1108             }
1109 380         239 my $feature_value_class;
1110 380 50       417 if (defined($feature_value)) {
1111 380         515 $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1112             }
1113 380 100 33     759 if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1114             print "\nNext answer returned by cache for feature $feature_name and " .
1115 372 50       443 "value $feature_value given class $class_name\n" if $self->{_debug2};
1116 372         730 return $self->{_probability_cache}->{$feature_value_class};
1117             }
1118 8         14 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1119              
1120 8 50       10 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1121 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1122             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1123 0         0 $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1124 0         0 $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1125 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
1126 0         0 $diffrange = $valuerange[1] - $valuerange[0];
1127             }
1128             }
1129 8         8 my @samples_for_class = ();
1130             # Accumulate all sample names for the given class:
1131 8         4 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  8         47  
1132 280 100       330 if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1133 140         111 push @samples_for_class, $sample_name;
1134             }
1135             }
1136 8 50       18 if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1137 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1138             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1139             my @sampling_points_for_feature =
1140 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1141 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1142 0         0 my @actual_feature_values_for_samples_in_class = ();
1143 0         0 foreach my $sample (@samples_for_class) {
1144 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1145 0         0 my $pattern = '(.+)=(.+)';
1146 0         0 $feature_and_value =~ /$pattern/;
1147 0         0 my ($feature, $value) = ($1, $2);
1148 0 0 0     0 if (($feature eq $feature_name) && ($value ne 'NA')) {
1149 0         0 push @actual_feature_values_for_samples_in_class, $value;
1150             }
1151             }
1152             }
1153 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1154 0         0 foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1155 0 0       0 if (abs($sampling_points_for_feature[$i] -
1156             $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1157 0         0 $counts_at_sampling_points[$i]++;
1158             }
1159             }
1160             }
1161 0         0 my $total_counts = 0;
1162 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1163 0 0       0 die "PFVC1 Something is wrong with your training file. It contains no training " .
1164             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1165 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1166 0         0 my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
  0         0  
1167             @sampling_points_for_feature;
1168 0         0 foreach my $i (0..@values_for_feature_and_class-1) {
1169 0         0 $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1170             }
1171 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1172 0         0 return $self->{_probability_cache}->{$feature_value_class};
1173             } else {
1174 0         0 return 0;
1175             }
1176             } else {
1177             # This section is for numeric features that will be treated symbolically
1178 0         0 my %seen = ();
1179 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1180 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1181 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1182 0         0 my @value_counts = (0) x @values_for_feature;
1183 0         0 foreach my $sample (@samples_for_class) {
1184 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1185 0         0 foreach my $i (0..@values_for_feature-1) {
1186 0         0 foreach my $current_value (@features_and_values) {
1187 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1188             }
1189             }
1190             }
1191 0         0 my $total_counts = 0;
1192 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1193 0 0       0 die "PFVC2 Something is wrong with your training file. It contains no training " .
1194             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1195             # We normalize by total_count because the probabilities are conditioned on a given class
1196 0         0 foreach my $i (0..@values_for_feature-1) {
1197 0         0 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1198 0         0 $self->{_probability_cache}->{$feature_and_value_and_class} =
1199             $value_counts[$i] / (1.0 * $total_counts);
1200             }
1201 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1202 0         0 return $self->{_probability_cache}->{$feature_value_class};
1203             } else {
1204 0         0 return 0;
1205             }
1206             }
1207             } else {
1208             # This section is for purely symbolic features
1209 8         6 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  8         38  
1210 8         8 my %seen = ();
1211 280 100 66     737 @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1212 8         6 @{$self->{_features_and_values_hash}->{$feature_name}};
  8         11  
1213 8         8 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  28         37  
1214 8         11 my @value_counts = (0) x @values_for_feature;
1215 8         9 foreach my $sample (@samples_for_class) {
1216 140         81 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         181  
1217 140         139 foreach my $i (0..@values_for_feature-1) {
1218 490         313 foreach my $current_value (@features_and_values) {
1219 1960 100       2377 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1220             }
1221             }
1222             }
1223 8         9 my $total_counts = 0;
1224 8         7 map {$total_counts += $_} @value_counts;
  28         23  
1225 8 50       29 die "PFVC2 Something is wrong with your training file. It contains no training " .
1226             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1227             # We normalize by total_count because the probabilities are conditioned on a given class
1228 8         11 foreach my $i (0..@values_for_feature-1) {
1229 28         32 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1230 28         61 $self->{_probability_cache}->{$feature_and_value_and_class} =
1231             $value_counts[$i] / (1.0 * $total_counts);
1232             }
1233 8 50       11 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1234 8         33 return $self->{_probability_cache}->{$feature_value_class};
1235             } else {
1236 0         0 return 0;
1237             }
1238             }
1239             }
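## The value returned above is a within-class ratio of counts: occurrences of the
## particular "feature=value" string among the training samples of the given class,
## divided by all value occurrences recorded for that feature in the same class.
## With hypothetical numbers, 6 occurrences of "exercising=never" among the 14
## values recorded for that feature in a class gives P(exercising=never | class) = 6/14.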
1240              
1241             sub probability_of_feature_less_than_threshold {
1242 0     0 0 0 my $self = shift;
1243 0         0 my $feature_name = shift;
1244 0         0 my $threshold = shift;
1245 0         0 my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1246             return $self->{_probability_cache}->{$feature_threshold_combo}
1247 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1248 0 0       0 my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
  0         0  
1249 0 0       0 my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
  0         0  
1250 0         0 my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1251 0         0 $self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1252 0         0 return $probability;
1253             }
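## The unconditioned version directly above is just the fraction of the non-'NA'
## training values of the feature that do not exceed the threshold; e.g., with
## hypothetical counts, 5 qualifying values out of 20 yields a probability of 0.25.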
1254              
1255             sub probability_of_feature_less_than_threshold_given_class {
1256 0     0 0 0 my $self = shift;
1257 0         0 my $feature_name = shift;
1258 0         0 my $threshold = shift;
1259 0         0 my $class_name = shift;
1260 0         0 my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1261             return $self->{_probability_cache}->{$feature_threshold_class_combo}
1262 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1263 0         0 my @data_samples_for_class = ();
1264             # Accumulate all sample names for the given class:
1265 0         0 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  0         0  
1266             push @data_samples_for_class, $sample_name
1267 0 0       0 if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1268             }
1269 0         0 my @actual_feature_values_for_samples_in_class = ();
1270 0         0 foreach my $sample (@data_samples_for_class) {
1271 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1272 0         0 my $pattern = '(.+)=(.+)';
1273 0         0 $feature_and_value =~ /$pattern/;
1274 0         0 my ($feature,$value) = ($1,$2);
1275 0 0 0     0 push @actual_feature_values_for_samples_in_class, $value
1276             if $feature eq $feature_name && $value ne 'NA';
1277             }
1278             }
1279 0 0       0 my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
  0         0  
1280             # The condition in the assignment that follows was a bug correction in Version 3.20
1281 0 0       0 my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1282 0         0 $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1283 0         0 return $probability;
1284             }
1285              
1286             # This method requires that all truly numeric types only be expressed as '<' or '>'
1287             # constructs in the array of branch features and thresholds
1288             sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1289 233     233 0 160 my $self = shift;
1290 233         157 my $arr = shift;
1291 233         253 my @array_of_features_and_values_or_thresholds = @$arr;
1292 233 50       294 return if scalar @array_of_features_and_values_or_thresholds == 0;
1293 233         278 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1294 233 100       463 return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1295 68         48 my $probability = undef;
1296 68         46 my $pattern1 = '(.+)=(.+)';
1297 68         34 my $pattern2 = '(.+)<(.+)';
1298 68         47 my $pattern3 = '(.+)>(.+)';
1299 68         53 my @true_numeric_types = ();
1300 68         59 my @true_numeric_types_feature_names = ();
1301 68         55 my @symbolic_types = ();
1302 68         41 my @symbolic_types_feature_names = ();
1303 68         69 foreach my $item (@array_of_features_and_values_or_thresholds) {
1304 186 50       358 if ($item =~ /$pattern2/) {
    50          
1305 0         0 push @true_numeric_types, $item;
1306 0         0 my ($feature,$value) = ($1,$2);
1307 0         0 push @true_numeric_types_feature_names, $feature;
1308             } elsif ($item =~ /$pattern3/) {
1309 0         0 push @true_numeric_types, $item;
1310 0         0 my ($feature,$value) = ($1,$2);
1311 0         0 push @true_numeric_types_feature_names, $feature;
1312             } else {
1313 186         128 push @symbolic_types, $item;
1314 186         280 $item =~ /$pattern1/;
1315 186         237 my ($feature,$value) = ($1,$2);
1316 186         193 push @symbolic_types_feature_names, $feature;
1317             }
1318             }
1319 68         66 my %seen1 = ();
1320 68 0       69 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1321 68         40 my %seen2 = ();
1322 68 50       61 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  186         409  
1323 68         107 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1324             print_array_with_msg("POS: Answer returned by find_bounded: ",
1325 68 50       81 $bounded_intervals_numeric_types) if $self->{_debug2};
1326             # Calculate the upper and the lower bounds to be used when searching for the best
1327             # threshold for each of the numeric features that are in play at the current node:
1328 68         44 my (%upperbound, %lowerbound);
1329 68         62 foreach my $feature_name (@true_numeric_types_feature_names) {
1330 0         0 $upperbound{$feature_name} = undef;
1331 0         0 $lowerbound{$feature_name} = undef;
1332             }
1333 68         69 foreach my $item (@$bounded_intervals_numeric_types) {
1334 0         0 foreach my $feature_grouping (@$item) {
1335 0 0       0 if ($feature_grouping->[1] eq '>') {
1336 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337             } else {
1338 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1339             }
1340             }
1341             }
1342 68         57 foreach my $feature_name (@true_numeric_types_feature_names) {
1343 0 0 0     0 if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
      0        
1344             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1345 0         0 return 0;
1346             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1347 0 0       0 if (! $probability) {
1348             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1349             $upperbound{$feature_name}) -
1350 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1351             } else {
1352             $probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1353             $upperbound{$feature_name}) -
1354 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1355             }
1356             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1357 0 0       0 if (! $probability) {
1358             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1359 0         0 $upperbound{$feature_name});
1360             } else {
1361             $probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1362 0         0 $upperbound{$feature_name});
1363             }
1364             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1365 0 0       0 if (! $probability) {
1366             $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1367 0         0 $lowerbound{$feature_name});
1368             } else {
1369             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1370 0         0 $lowerbound{$feature_name}));
1371             }
1372             } else {
1373 0         0 die("Ill formatted call to 'probability_of_sequence' method");
1374             }
1375             }
1376 68         50 foreach my $feature_and_value (@symbolic_types) {
1377 186 50       508 if ($feature_and_value =~ /$pattern1/) {
1378 186         228 my ($feature,$value) = ($1,$2);
1379 186 100       179 if (! $probability) {
1380 68         113 $probability = $self->probability_of_feature_value($feature, $value);
1381             } else {
1382 118         132 $probability *= $self->probability_of_feature_value($feature, $value);
1383             }
1384             }
1385             }
1386 68         106 $self->{_probability_cache}->{$sequence} = $probability;
1387 68         152 return $probability;
1388             }
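## After the branch thresholds are reduced to at most one lower and one upper bound
## per numeric feature, the probability of the whole branch sequence is accumulated
## as a product: an interval term for each bounded numeric feature and a
## P(feature=value) term for each symbolic entry.  Illustrative sketch with
## hypothetical feature names and thresholds:
##
##     P("g2>3.8" : "grade=2")  =  (1.0 - P(g2 < 3.8)) * P(grade=2)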
1389              
1390             ## The following method requires that all truly numeric types only be expressed as
1391             ## '<' or '>' constructs in the array of branch features and thresholds
1392             sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1393 164     164 0 108 my $self = shift;
1394 164         103 my $arr = shift;
1395 164         118 my $class_name = shift;
1396 164         180 my @array_of_features_and_values_or_thresholds = @$arr;
1397 164 50       216 return if scalar @array_of_features_and_values_or_thresholds == 0;
1398 164         212 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1399 164         178 my $sequence_with_class = "$sequence" . "::" . $class_name;
1400             return $self->{_probability_cache}->{$sequence_with_class}
1401 164 100       271 if exists $self->{_probability_cache}->{$sequence_with_class};
1402 144         95 my $probability = undef;
1403 144         118 my $pattern1 = '(.+)=(.+)';
1404 144         85 my $pattern2 = '(.+)<(.+)';
1405 144         85 my $pattern3 = '(.+)>(.+)';
1406 144         132 my @true_numeric_types = ();
1407 144         97 my @true_numeric_types_feature_names = ();
1408 144         86 my @symbolic_types = ();
1409 144         106 my @symbolic_types_feature_names = ();
1410 144         128 foreach my $item (@array_of_features_and_values_or_thresholds) {
1411 380 50       704 if ($item =~ /$pattern2/) {
    50          
1412 0         0 push @true_numeric_types, $item;
1413 0         0 my ($feature,$value) = ($1,$2);
1414 0         0 push @true_numeric_types_feature_names, $feature;
1415             } elsif ($item =~ /$pattern3/) {
1416 0         0 push @true_numeric_types, $item;
1417 0         0 my ($feature,$value) = ($1,$2);
1418 0         0 push @true_numeric_types_feature_names, $feature;
1419             } else {
1420 380         269 push @symbolic_types, $item;
1421 380         603 $item =~ /$pattern1/;
1422 380         486 my ($feature,$value) = ($1,$2);
1423 380         416 push @symbolic_types_feature_names, $feature;
1424             }
1425             }
1426 144         133 my %seen1 = ();
1427 144 0       125 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1428 144         101 my %seen2 = ();
1429 144 50       120 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  380         841  
1430 144         205 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1431             print_array_with_msg("POSC: Answer returned by find_bounded: ",
1432 144 50       186 $bounded_intervals_numeric_types) if $self->{_debug2};
1433             # Calculate the upper and the lower bounds to be used when searching for the best
1434             # threshold for each of the numeric features that are in play at the current node:
1435 144         93 my (%upperbound, %lowerbound);
1436 144         130 foreach my $feature_name (@true_numeric_types_feature_names) {
1437 0         0 $upperbound{$feature_name} = undef;
1438 0         0 $lowerbound{$feature_name} = undef;
1439             }
1440 144         129 foreach my $item (@$bounded_intervals_numeric_types) {
1441 0         0 foreach my $feature_grouping (@$item) {
1442 0 0       0 if ($feature_grouping->[1] eq '>') {
1443 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444             } else {
1445 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1446             }
1447             }
1448             }
1449 144         119 foreach my $feature_name (@true_numeric_types_feature_names) {
1450 0 0 0     0 if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
    0 0        
    0 0        
    0 0        
      0        
1451             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1452 0         0 return 0;
1453             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1454 0 0       0 if (! $probability) {
1455              
1456             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457             $upperbound{$feature_name}, $class_name) -
1458             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1459 0         0 $lowerbound{$feature_name}, $class_name);
1460             } else {
1461             $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462             $upperbound{$feature_name}, $class_name) -
1463             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1464 0         0 $lowerbound{$feature_name}, $class_name))
1465             }
1466             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1467 0 0       0 if (! $probability) {
1468             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1469 0         0 $upperbound{$feature_name}, $class_name);
1470             } else {
1471             $probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1472 0         0 $upperbound{$feature_name}, $class_name);
1473             }
1474             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1475 0 0       0 if (! $probability) {
1476             $probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1477 0         0 $lowerbound{$feature_name}, $class_name);
1478             } else {
1479             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1480 0         0 $lowerbound{$feature_name}, $class_name));
1481             }
1482             } else {
1483 0         0 die("Ill formatted call to 'probability of sequence given class' method");
1484             }
1485             }
1486 144         122 foreach my $feature_and_value (@symbolic_types) {
1487 380 50       1027 if ($feature_and_value =~ /$pattern1/) {
1488 380         493 my ($feature,$value) = ($1,$2);
1489 380 100       365 if (! $probability) {
1490 144         192 $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491             } else {
1492 236         272 $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1493             }
1494             }
1495             }
1496 144         250 $self->{_probability_cache}->{$sequence_with_class} = $probability;
1497 144         337 return $probability;
1498             }
1499              
1500             sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1501 244     244 0 153 my $self = shift;
1502 244         161 my $class_name = shift;
1503 244         157 my $arr = shift;
1504 244         248 my @array_of_features_and_values_or_thresholds = @$arr;
1505 244         240 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1506 244         233 my $class_and_sequence = "$class_name" . "::" . $sequence;
1507             return $self->{_probability_cache}->{$class_and_sequence}
1508 244 100       549 if exists $self->{_probability_cache}->{$class_and_sequence};
1509 82         55 my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
  82         114  
1510 82         54 foreach my $i (0..@{$self->{_class_names}}-1) {
  82         100  
1511 164         131 my $class_name = $self->{_class_names}->[$i];
1512 164         198 my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1513             \@array_of_features_and_values_or_thresholds, $class_name);
1514 164 100       224 if ($prob < 0.000001) {
1515 13         16 $array_of_class_probabilities[$i] = 0.0;
1516 13         18 next;
1517             }
1518 151         208 my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1519             \@array_of_features_and_values_or_thresholds);
1520             # die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1521             # "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1522             # if ! $prob_of_feature_sequence;
1523 151         186 my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1524 151 50       159 if ($prob_of_feature_sequence) {
1525 151         210 $array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1526             } else {
1527 0         0 $array_of_class_probabilities[$i] = $prior;
1528             }
1529             }
1530 82         56 my $sum_probability;
1531 82         66 map {$sum_probability += $_} @array_of_class_probabilities;
  164         159  
1532 82 50       92 if ($sum_probability == 0) {
1533 0         0 @array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
  0         0  
1534 0         0 (0..@{$self->{_class_names}}-1);
  0         0  
1535             } else {
1536 82         63 @array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
  164         200  
1537             }
1538 82         64 foreach my $i (0..@{$self->{_class_names}}-1) {
  82         143  
1539 164         204 my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1540 164         275 $self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1541             }
1542 82         138 return $self->{_probability_cache}->{$class_and_sequence};
1543             }
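## The posterior returned above is Bayes' rule followed by a normalization over the
## classes:
##
##     P(class | sequence)  =  P(sequence | class) * P(class) / P(sequence)
##
## with a fallback to the prior when P(sequence) is zero and to a uniform
## distribution when every unnormalized posterior is zero.  A worked example with
## hypothetical numbers: priors of 0.4 and 0.6 and likelihoods of 0.02 and 0.01 give
## unnormalized posteriors proportional to 0.008 and 0.006, which normalize to
## 8/14 and 6/14, roughly 0.57 and 0.43.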
1544              
1545             ####################################### Class Based Utilities ##########################################
1546              
1547             ## Given a list of branch attributes for the numeric features of the form, say,
1548             ## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1549             ## smallest list that is relevant for the purpose of calculating the probabilities.
1550             ## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1551             ## time, less than 2, AND, at the same time, less than 3, is the same as the
1552             ## probability that the feature less than 1. Similarly, the probability that 'age'
1553             ## is greater than 34 and also greater than 37 is the same as `age' being greater
1554             ## than 37.
1555             sub find_bounded_intervals_for_numeric_features {
1556 230     230 0 178 my $self = shift;
1557 230         141 my $arr = shift;
1558 230         174 my @arr = @$arr;
1559 230         151 my @features = @{$self->{_feature_names}};
  230         295  
1560 230         192 my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
  0         0  
  0         0  
1561 230 50       316 print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1562 230         155 my @arr3 = ();
1563 230         198 foreach my $feature_name (@features) {
1564 920         624 my @temp = ();
1565 920         633 foreach my $x (@arr1) {
1566 0 0 0     0 push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1567             }
1568 920 50       1201 push @arr3, \@temp if @temp > 0;
1569             }
1570 230 50       276 print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1571             # Sort each list so that '<' entries occur before '>' entries:
1572 230         138 my @arr4;
1573 230         166 foreach my $li (@arr3) {
1574 0         0 my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
  0         0  
1575 0         0 push @arr4, \@sorted;
1576             }
1577 230 50       281 print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1578 230         168 my @arr5;
1579 230         181 foreach my $li (@arr4) {
1580 0         0 my @temp1 = ();
1581 0         0 my @temp2 = ();
1582 0         0 foreach my $inner (@$li) {
1583 0 0       0 if ($inner->[1] eq '<') {
1584 0         0 push @temp1, $inner;
1585             } else {
1586 0         0 push @temp2, $inner;
1587             }
1588             }
1589 0 0 0     0 if (@temp1 > 0 && @temp2 > 0) {
    0          
1590 0         0 push @arr5, [\@temp1, \@temp2];
1591             } elsif (@temp1 > 0) {
1592 0         0 push @arr5, [\@temp1];
1593             } else {
1594 0         0 push @arr5, [\@temp2];
1595             }
1596             }
1597 230 50       263 print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1598 230         180 my @arr6 = ();
1599 230         168 foreach my $li (@arr5) {
1600 0         0 my @temp1 = ();
1601 0         0 foreach my $inner (@$li) {
1602 0         0 my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
  0         0  
1603 0         0 push @temp1, \@sorted;
1604             }
1605 0         0 push @arr6, \@temp1;
1606             }
1607 230 50       256 print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1608 230         182 my @arr9 = ();
1609 230         204 foreach my $li (@arr6) {
1610 0         0 foreach my $alist (@$li) {
1611 0         0 my @newalist = ();
1612 0 0       0 if ($alist->[0][1] eq '<') {
1613 0         0 push @newalist, $alist->[0];
1614             } else {
1615 0         0 push @newalist, $alist->[-1];
1616             }
1617 0 0       0 if ($alist->[0][1] ne $alist->[-1][1]) {
1618 0         0 push @newalist, $alist->[-1];
1619             }
1620 0         0 push @arr9, \@newalist;
1621             }
1622             }
1623 230 50       292 print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1624 230         324 return \@arr9;
1625              
1626             }
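## To make the reduction concrete: for the example input mentioned in the comment
## block above,
##
##     ['g2<1', 'g2<2', 'g2<3', 'age>34', 'age>36', 'age>37']
##
## the only bounds that matter are "g2 < 1" and "age > 37", so the returned structure
## carries one [feature_name, comparison_op, threshold] entry per bounded side of
## each numeric feature.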
1627              
1628             ## This method is used to verify that you used legal feature names in the test
1629             ## sample that you want to classify with the decision tree.
1630             sub check_names_used {
1631 0     0 0 0 my $self = shift;
1632 0         0 my $features_and_values_test_data = shift;
1633 0         0 my @features_and_values_test_data = @$features_and_values_test_data;
1634 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
1635 0         0 foreach my $feature_and_value (@features_and_values_test_data) {
1636 0         0 $feature_and_value =~ /$pattern/;
1637 0         0 my ($feature,$value) = ($1,$2);
1638 0 0 0     0 die "Your test data has formatting error" unless defined($feature) && defined($value);
1639 0 0       0 return 0 unless contained_in($feature, @{$self->{_feature_names}});
  0         0  
1640             }
1641 0         0 return 1;
1642             }
1643              
1644             ####################################### Data Condition Calculator ######################################
1645              
1646             ## This method estimates the worst-case fan-out of the decision tree taking into
1647             ## account the number of values (and therefore the number of branches emanating from
1648             ## a node) for the symbolic features.
1649             sub determine_data_condition {
1650 0     0 0 0 my $self = shift;
1651 0         0 my $num_of_features = scalar @{$self->{_feature_names}};
  0         0  
1652 0         0 my @values = ();
1653 0         0 my @number_of_values;
1654 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
1655 0         0 push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1656 0 0       0 if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
  0         0  
1657 0         0 push @number_of_values, scalar @values;
1658             }
1659 0 0       0 return if ! @values;
1660 0         0 print "Number of features: $num_of_features\n";
1661 0         0 my @minmax = minmax(\@number_of_values);
1662 0         0 my $max_num_values = $minmax[1];
1663 0         0 print "Largest number of values for symbolic features is: $max_num_values\n";
1664 0         0 my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1665 0         0 print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1666             "nodes. The exact number of nodes created depends critically on " .
1667             "the entropy_threshold used for node expansion (the default value " .
1668             "for this threshold is 0.01) and on the value set for max_depth_desired " .
1669             "for the depth of the tree.\n";
1670 0 0       0 if ($estimated_number_of_nodes > 10000) {
1671 0         0 print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1672             "large value for entropy_threshold and/or a small value for " .
1673             "for max_depth_desired to reduce the number of nodes created.\n";
1674 0         0 print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1675 0         0 my $answer = ;
1676 0         0 $answer =~ s/\r?\n?$//;
1677 0   0     0 while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1678 0         0 print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1679 0         0 $answer = <STDIN>;
1680 0         0 $answer =~ s/\r?\n?$//;
1681             }
1682 0 0       0 die unless $answer =~ /y(es)?/i;
1683             }
1684             }
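## The worst-case estimate printed above is simply max_num_values raised to the power
## of the number of features.  For instance, 4 symbolic features with at most 3
## distinct values each gives 3 ** 4 = 81 nodes, while 8 features with 5 values each
## gives 5 ** 8 = 390625, which is why the method asks for confirmation once the
## estimate exceeds 10000 nodes.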
1685              
1686              
1687             #################################### Read Training Data From File ######################################
1688              
1689              
1690             sub get_training_data {
1691 1     1 1 5 my $self = shift;
1692 1 50       11 die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1693 1         2 my %class_names = ();
1694 1         2 my %all_record_ids_with_class_labels;
1695             my $firstline;
1696 0         0 my %data_hash;
1697 1         2 $|++;
1698 1   50     24 open(FILEIN, $self->{_training_datafile}) || die "unable to open $self->{_training_datafile}: $!";
1699 1         2 my $record_index = 0;
1700 1         1 my $firsetline;
1701 1         13 while (<FILEIN>) {
1702 36 50       64 next if /^[ ]*\r?\n?$/;
1703 36         72 $_ =~ s/\r?\n?$//;
1704 36 50       47 my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
1705 36 100       42 if ($record_index == 0) {
1706 1         1 $firstline = $record;
1707 1         1 $record_index++;
1708 1         3 next;
1709             }
1710 35         102 my @parts = split /,/, $record;
1711 35         29 my $classname = $parts[$self->{_csv_class_column_index}];
1712 35         22 $class_names{$classname} = 1;
1713 35         29 my $record_label = shift @parts;
1714 35         46 $record_label =~ s/^\s*\"|\"\s*$//g;
1715 35         48 $data_hash{$record_label} = \@parts;
1716 35         31 $all_record_ids_with_class_labels{$record_label} = $classname;
1717 35 50       45 print "." if $record_index % 10000 == 0;
1718 35         74 $record_index++;
1719             }
1720 1         6 close FILEIN;
1721 1         2 $|--;
1722 1         2 $self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1723 1 50       3 print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1724 1         6 my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1725 1         2 my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1726 1         2 my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
  2         6  
1727 1         2 my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
  4         5  
  1         2  
1728 1         6 my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
  35         59  
1729 1         7 my @sample_names = map {"sample_$_"} keys %data_hash;
  35         30  
1730 1 50       6 my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
  35         21  
  35         26  
  140         99  
  140         97  
  140         331  
  35         32  
1731 1 50       5 my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
  4         3  
  4         3  
  4         16  
  140         78  
  140         199  
  140         111  
  1         2  
1732 1         2 my %numeric_features_valuerange_hash = ();
1733 1         1 my %feature_values_how_many_uniques_hash = ();
1734 1         1 my %features_and_unique_values_hash = ();
1735 1         3 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1736 1         3 foreach my $feature (keys %features_and_values_hash) {
1737 4         4 my %seen1 = ();
1738 140 100 66     369 my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1739 4         3 @{$features_and_values_hash{$feature}};
  4         6  
1740 4         5 $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1741 4         5 my $not_all_values_float = 0;
1742 4 50       5 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  14         92  
1743 4 50       9 if ($not_all_values_float == 0) {
1744 0         0 my @minmaxvalues = minmax(\@unique_values_for_feature);
1745 0         0 $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1746             }
1747 4         8 $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1748             }
1749 1 50       4 if ($self->{_debug1}) {
1750 0         0 print "\nAll class names: @all_class_names\n";
1751 0         0 print "\nEach sample data record:\n";
1752 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
  0         0  
1753 0         0 print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
  0         0  
1754             }
1755 0         0 print "\nclass label for each data sample:\n";
1756 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
  0         0  
1757 0         0 print "$sample => $class_for_sample_hash{$sample}\n";
1758             }
1759 0         0 print "\nFeatures used: @feature_names\n\n";
1760 0         0 print "\nfeatures and the values taken by them:\n";
1761 0         0 foreach my $feature (sort keys %features_and_values_hash) {
1762 0         0 print "$feature => @{$features_and_values_hash{$feature}}\n";
  0         0  
1763             }
1764 0         0 print "\nnumeric features and their ranges:\n";
1765 0         0 foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1766 0         0 print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
  0         0  
1767             }
1768 0         0 print "\nnumber of unique values in each feature:\n";
1769 0         0 foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1770 0         0 print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1771             }
1772             }
1773 1         2 $self->{_class_names} = \@all_class_names;
1774 1         5 $self->{_feature_names} = \@feature_names;
1775 1         2 $self->{_samples_class_label_hash} = \%class_for_sample_hash;
1776 1         2 $self->{_training_data_hash} = \%feature_values_for_samples_hash;
1777 1         2 $self->{_features_and_values_hash} = \%features_and_values_hash;
1778 1         2 $self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1779 1         1 $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1780 1         18 $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1781             }
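## A minimal sketch of how this CSV reader expects to be driven.  The file name, the
## column layout, and the option values below are hypothetical; the firm requirements
## visible in this method are that the file name end in ".csv", that column 0 of each
## record hold a unique integer identifier, and that the class column and the feature
## columns be named by index through the constructor options.
##
##     use Algorithm::DecisionTree;
##     # hypothetical layout: col 0 = record id, col 1 = class label, cols 2..5 = features
##     my $dt = Algorithm::DecisionTree->new(
##                    training_datafile        => 'stage3cancer.csv',
##                    csv_class_column_index   => 1,
##                    csv_columns_for_features => [2, 3, 4, 5],
##                    csv_cleanup_needed       => 1,
##              );
##     $dt->get_training_data();
##     $dt->calculate_first_order_probabilities();
##     $dt->calculate_class_priors();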
1782              
1783             sub show_training_data {
1784 0     0 1 0 my $self = shift;
1785 0         0 my @class_names = @{$self->{_class_names}};
  0         0  
1786 0         0 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  0         0  
1787 0         0 my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
  0         0  
1788 0         0 my %training_data_hash = %{$self->{_training_data_hash}};
  0         0  
1789 0         0 print "\n\nClass Names: @class_names\n";
1790 0         0 print "\n\nFeatures and Their Values:\n\n";
1791 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
1792 0         0 print "$k ---> @{$features_and_values_hash{$k}}\n";
  0         0  
1793             }
1794 0         0 print "\n\nSamples vs. Class Labels:\n\n";
1795 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
  0         0  
1796 0         0 print "$kee => $samples_class_label_hash{$kee}\n";
1797             }
1798 0         0 print "\n\nTraining Samples:\n\n";
1799 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
  0         0  
1800             keys %training_data_hash) {
1801 0         0 print "$kee => @{$training_data_hash{$kee}}\n";
  0         0  
1802             }
1803             }
1804              
1805             sub get_class_names {
1806 0     0 0 0 my $self = shift;
1807 0         0 return @{$self->{_class_names}}
  0         0  
1808             }
1809              
1810             ########################################## Utility Routines ############################################
1811              
1812             sub closest_sampling_point {
1813 0     0 0 0 my $value = shift;
1814 0         0 my $arr_ref = shift;
1815 0         0 my @arr = @{$arr_ref};
  0         0  
1816 0         0 my @compare = map {abs($_ - $value)} @arr;
  0         0  
1817 0         0 my ($minval,$index) = minimum(\@compare);
1818 0         0 return $arr[$index];
1819             }
1820              
1821             ## returns the array index that contains a specified STRING value: (meant only for array of strings)
1822             sub get_index_at_value {
1823 0     0 0 0 my $value = shift;
1824 0         0 my @array = @{shift @_};
  0         0  
1825 0         0 foreach my $i (0..@array-1) {
1826 0 0       0 return $i if $value eq $array[$i];
1827             }
1828             }
1829              
1830             ## When the training data is read from a CSV file, we assume that the first column
1831             ## of each data record contains a unique integer identifier for the record in that
1832             ## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1833             ## followed by the identifying integers. The purpose of this function is to return
1834             ## the identifying integer associated with a data record.
1835             sub sample_index {
1836 1096     1096 0 678 my $arg = shift;
1837 1096         1079 $arg =~ /_(.+)$/;
1838 1096         1212 return $1;
1839             }
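## For example, sample_index("sample_23") returns 23, which is what makes the numeric
## sorts of the form  sort {sample_index($a) <=> sample_index($b)}  used elsewhere in
## this module work as intended.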
1840              
1841             # Returns the minimum value and its positional index in an array
1842             sub minimum {
1843 0     0 0 0 my $arr = shift;
1844 0         0 my $min;
1845             my $index;
1846 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1847 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1848 0         0 $index = $i;
1849 0         0 $min = $arr->[$i];
1850             }
1851             }
1852 0         0 return ($min, $index);
1853             }
1854              
1855             # Returns an array of two values, the min and the max, of an array of floats
1856             sub minmax {
1857 0     0 0 0 my $arr = shift;
1858 0         0 my ($min, $max);
1859 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1860 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1861 0         0 $min = $arr->[$i];
1862             }
1863 0 0 0     0 if ( (!defined $max) || ($arr->[$i] > $max) ) {
1864 0         0 $max = $arr->[$i];
1865             }
1866             }
1867 0         0 return ($min, $max);
1868             }
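## A quick sketch of how the numeric helpers above behave when called from
## within the module (the array below is made up):

my @points        = (2.0, 3.5, 7.0, 9.0);
my ($min, $where) = minimum(\@points);                      # (2.0, 0)
my ($lo, $hi)     = minmax(\@points);                       # (2.0, 9.0)
my $nearest       = closest_sampling_point(4.1, \@points);  # 3.5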
1869              
1870             # checks whether an element is in an array:
1871             sub contained_in {
1872 169     169 0 113 my $ele = shift;
1873 169         162 my @array = @_;
1874 169         95 my $count = 0;
1875 169 100       129 map {$count++ if $ele eq $_} @array;
  376         598  
1876 169         275 return $count;
1877             }
1878              
1879             # Meant only for an array of strings (no nesting):
1880             sub deep_copy_array {
1881 136     136 0 96 my $ref_in = shift;
1882 136         80 my $ref_out;
1883 136 100       206 return [] if scalar @$ref_in == 0;
1884 121         83 foreach my $i (0..@{$ref_in}-1) {
  121         158  
1885 239         251 $ref_out->[$i] = $ref_in->[$i];
1886             }
1887 121         171 return $ref_out;
1888             }
1889              
1890             sub check_for_illegal_params2 {
1891 1     1 0 2 my @params = @_;
1892 1         3 my @legal_params = qw / training_datafile
1893             entropy_threshold
1894             max_depth_desired
1895             csv_class_column_index
1896             csv_columns_for_features
1897             symbolic_to_numeric_cardinality_threshold
1898             number_of_histogram_bins
1899             csv_cleanup_needed
1900             debug1
1901             debug2
1902             debug3
1903             /;
1904 1         1 my $found_match_flag;
1905 1         3 foreach my $param (@params) {
1906 3         2 foreach my $legal (@legal_params) {
1907 10         8 $found_match_flag = 0;
1908 10 100       15 if ($param eq $legal) {
1909 3         2 $found_match_flag = 1;
1910 3         3 last;
1911             }
1912             }
1913 3 50       5 last if $found_match_flag == 0;
1914             }
1915 1         4 return $found_match_flag;
1916             }
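## check_for_illegal_params2() is what makes the constructor croak on a
## misspelled keyword.  A hedged sketch of a constructor call that uses only
## the legal keywords listed above (the CSV file name and the column choices
## are hypothetical):

my $dt = Algorithm::DecisionTree->new(
             training_datafile                         => 'stage3cancer.csv',
             csv_class_column_index                    => 2,
             csv_columns_for_features                  => [3,4,5,6,7,8],
             entropy_threshold                         => 0.01,
             max_depth_desired                         => 8,
             symbolic_to_numeric_cardinality_threshold => 10,
             csv_cleanup_needed                        => 1,
         );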
1917              
1918             sub print_array_with_msg {
1919 0     0 0 0 my $message = shift;
1920 0         0 my $arr = shift;
1921 0         0 print "\n$message: ";
1922 0         0 print_nested_array( $arr );
1923             }
1924              
1925             sub print_nested_array {
1926 0     0 0 0 my $arr = shift;
1927 0         0 my @arr = @$arr;
1928 0         0 print "[";
1929 0         0 foreach my $item (@arr) {
1930 0 0       0 if (ref $item) {
1931 0         0 print_nested_array($item);
1932             } else {
1933 0         0 print "$item";
1934             }
1935             }
1936 0         0 print "]";
1937             }
1938              
1939             sub cleanup_csv {
1940 0     0 0 0 my $line = shift;
1941 0         0 $line =~ tr/\/:?()[]{}'/ /;
1942 0         0 my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1943 0         0 for (@double_quoted) {
1944 0         0 my $item = $_;
1945 0         0 $item = substr($item, 1, -1);
1946 0         0 $item =~ s/^\s+|,|\s+$//g;    # strip embedded commas and leading/trailing whitespace
1947 0         0 $item = join '_', split /\s+/, $item;
1948 0         0 substr($line, index($line, $_), length($_)) = $item;
1949             }
1950 0         0 my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
1951 0         0 for (@white_spaced) {
1952 0         0 my $item = $_;
1953 0         0 $item =~ s/\s+/_/g;
1954 0         0 $item =~ s/^\s*_|_\s*$//g;
1955 0         0 substr($line, index($line, $_), length($_)) = $item;
1956             }
1957 0         0 $line =~ s/,\s*(?=,|$)/,NA/g;
1958 0         0 return $line;
1959             }
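## A hedged sketch of what cleanup_csv() does to a raw record when called from
## within the module: commas inside double-quoted fields are dropped and the
## remaining words joined with underscores, runs of whitespace in the other
## fields become underscores, and empty fields become 'NA' (the record below
## is made up):

my $raw     = '25,"IDC, Stage 3",  ,4.5';
my $cleaned = cleanup_csv($raw);
# $cleaned should now be:  25,IDC_Stage_3,NA,4.5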
1960              
1961             ######################################### Class EvalTrainingData ########################################
1962              
1963             ## This subclass of the DecisionTree class is used to evaluate the quality of your
1964             ## training data by running a 10-fold cross-validation test on it. This test divides
1965             ## all of the training data into ten parts, with nine parts used for training a
1966             ## decision tree and one part used for testing its ability to classify correctly.
1967             ## This selection of nine parts for training and one part for testing is carried out
1968             ## in all of the ten different possible ways. This testing functionality can also
1969             ## be used to find the best values to use for the constructor parameters
1970             ## entropy_threshold, max_depth_desired, and
1971             ## symbolic_to_numeric_cardinality_threshold.
1972              
1973             ## Only CSV training files can be evaluated in this manner (because only CSV
1974             ## training files are allowed to have numeric features --- which is the more
1975             ## interesting case for evaluation analytics).
1976              
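## A hedged usage sketch for this class (the CSV file name and the column
## choices are hypothetical, and get_training_data() is assumed to be the
## usual data-ingestion call inherited from Algorithm::DecisionTree):

my $evaluator = EvalTrainingData->new(
                    training_datafile        => 'training.csv',
                    csv_class_column_index   => 1,
                    csv_columns_for_features => [2,3,4,5],
                    csv_cleanup_needed       => 1,
                );
$evaluator->get_training_data();
$evaluator->evaluate_training_data();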
1977             package EvalTrainingData;
1978              
1979             @EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1980              
1981             sub new {
1982 0     0   0 my $class = shift;
1983 0         0 my $instance = Algorithm::DecisionTree->new(@_);
1984 0         0 bless $instance, $class;
1985             }
1986              
1987             sub evaluate_training_data {
1988 0     0   0 my $self = shift;
1989 0         0 my $evaldebug = 0;
1990             die "The data evaluation function in the module can only be used when your " .
1991 0 0       0 "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1992 0         0 print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1993             "class-discriminatory power:\n";
1994 0         0 my %all_training_data = %{$self->{_training_data_hash}};
  0         0  
1995 0         0 my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
1996             Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
1997 0         0 my $fold_size = int(0.1 * (scalar keys %all_training_data));
1998 0         0 print "fold size: $fold_size\n";
1999 0         0 my %confusion_matrix = ();
2000 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2001 0         0 foreach my $inner_class_name (@{$self->{_class_names}}) {
  0         0  
2002 0         0 $confusion_matrix{$class_name}->{$inner_class_name} = 0;
2003             }
2004             }
2005 0         0 foreach my $fold_index (0..9) {
2006 0         0 print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2007 0         0 my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2008 0         0 my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2009             @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2010 0         0 my %testing_data = ();
2011 0         0 foreach my $x (@testing_samples) {
2012 0         0 $testing_data{$x} = $all_training_data{$x};
2013             }
2014 0         0 my %training_data = ();
2015 0         0 foreach my $x (@training_samples) {
2016 0         0 $training_data{$x} = $all_training_data{$x};
2017             }
2018 0         0 my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2019 0         0 $trainingDT->{_training_data_hash} = \%training_data;
2020 0         0 $trainingDT->{_class_names} = $self->{_class_names};
2021 0         0 $trainingDT->{_feature_names} = $self->{_feature_names};
2022 0         0 $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2023 0         0 $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2024             $trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2025 0         0 $self->{_symbolic_to_numeric_cardinality_threshold};
2026 0         0 foreach my $sample_name (@training_samples) {
2027             $trainingDT->{_samples_class_label_hash}->{$sample_name} =
2028 0         0 $self->{_samples_class_label_hash}->{$sample_name};
2029             }
2030 0         0 foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
  0         0  
2031 0         0 $trainingDT->{_features_and_values_hash}->{$feature} = ();
2032             }
2033 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
2034 0         0 foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
2035             Algorithm::DecisionTree::sample_index($b)}
2036 0         0 keys %{$trainingDT->{_training_data_hash}}) {
2037 0         0 foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
  0         0  
2038 0         0 $feature_and_value =~ /$pattern/;
2039 0         0 my ($feature,$value) = ($1,$2);
2040 0 0       0 push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
  0         0  
2041             }
2042             }
2043 0         0 foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2044 0         0 my %seen = ();
2045 0 0 0     0 my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2046 0         0 @{$trainingDT->{_features_and_values_hash}->{$feature}};
  0         0  
2047 0 0       0 if (Algorithm::DecisionTree::contained_in($feature,
2048 0         0 keys %{$self->{_numeric_features_valuerange_hash}})) {
2049 0         0 @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
  0         0  
2050             } else {
2051 0         0 @unique_values_for_feature = sort @unique_values_for_feature;
2052             }
2053 0         0 $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2054             }
2055 0         0 foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
  0         0  
2056             my @minmaxvalues = Algorithm::DecisionTree::minmax(
2057 0         0 \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
  0         0  
2058 0         0 $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2059             }
2060 0 0       0 if ($evaldebug) {
2061 0         0 print "\n\nprinting samples in the testing set: @testing_samples\n";
2062 0         0 print "\n\nPrinting features and their values in the training set:\n";
2063 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2064 0         0 print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
  0         0  
2065             }
2066 0         0 print "\n\nPrinting unique values for features:\n";
2067 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
  0         0  
2068 0         0 print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
  0         0  
2069             }
2070 0         0 print "\n\nPrinting unique value ranges for features:\n";
2071 0         0 foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
  0         0  
2072 0         0 print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
  0         0  
2073             }
2074             }
2075 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
2076             $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2077 0         0 scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
  0         0  
2078             }
2079 0 0       0 $trainingDT->{_debug2} = 1 if $evaldebug;
2080 0         0 $trainingDT->calculate_first_order_probabilities();
2081 0         0 $trainingDT->calculate_class_priors();
2082 0         0 my $root_node = $trainingDT->construct_decision_tree_classifier();
2083 0 0       0 $root_node->display_decision_tree(" ") if $evaldebug;
2084 0         0 foreach my $test_sample_name (@testing_samples) {
2085 0         0 my @test_sample_data = @{$all_training_data{$test_sample_name}};
  0         0  
2086 0 0       0 print "original data in test sample: @test_sample_data\n" if $evaldebug;
2087 0 0 0     0 @test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
  0         0  
2088 0 0       0 print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2089 0         0 my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
  0         0  
2090 0         0 my @solution_path = @{$classification{'solution_path'}};
  0         0  
2091 0         0 delete $classification{'solution_path'};
2092 0         0 my @which_classes = keys %classification;
2093 0         0 @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
  0         0  
2094 0         0 my $most_likely_class_label = $which_classes[0];
2095 0 0       0 if ($evaldebug) {
2096 0         0 print "\nClassification:\n\n";
2097 0         0 print " class probability\n";
2098 0         0 print " ---------- -----------\n";
2099 0         0 foreach my $which_class (@which_classes) {
2100 0         0 my $classstring = sprintf("%-30s", $which_class);
2101 0         0 my $valuestring = sprintf("%-30s", $classification{$which_class});
2102 0         0 print " $classstring $valuestring\n";
2103             }
2104 0         0 print "\nSolution path in the decision tree: @solution_path\n";
2105 0         0 print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2106             }
2107 0         0 my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2108 0 0       0 print "$test_sample_name: true_class: $true_class_label_for_sample " .
2109             "estimated_class: $most_likely_class_label\n" if $evaldebug;
2110 0         0 $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2111             }
2112             }
2113 0         0 print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2114 0         0 my $matrix_header = " " x 30;
2115 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2116 0         0 $matrix_header .= sprintf("%-30s", $class_name);
2117             }
2118 0         0 print "\n" . $matrix_header . "\n\n";
2119 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2120 0         0 my $row_display = sprintf("%-30s", $row_class_name);
2121 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2122 0         0 $row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2123             }
2124 0         0 print "$row_display\n\n";
2125             }
2126 0         0 print "\n\n";
2127 0         0 my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2128 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2129 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2130 0 0       0 if ($row_class_name eq $col_class_name) {
2131 0         0 $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2132             } else {
2133 0         0 $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2134             }
2135             }
2136             }
2137 0         0 my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2138 0         0 print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2139 0 0 0     0 if ($data_quality_index <= 80) {
    0 0        
    0 0        
    0          
2140 0         0 print "\nYour training data does not possess much class discriminatory " .
2141             "information. It could be that the classes are inherently not well " .
2142             "separable or that your constructor parameter choices are not appropriate.\n";
2143             } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2144 0         0 print "\nYour training data possesses some class discriminatory information " .
2145             "but it may not be sufficient for real-world applications. You might " .
2146             "try tweaking the constructor parameters to see if that improves the " .
2147             "class discriminations.\n";
2148             } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2149 0         0 print "\nYour training data appears to possess good class discriminatory " .
2150             "information. Whether or not it is acceptable would depend on your " .
2151             "application.\n";
2152             } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2153 0         0 print "\nYour training data is of excellent quality.\n";
2154             } else {
2155 0         0 print "\nYour training data is perfect.\n";
2156             }
2157              
2158             }
2159              
2160              
2161             ############################################# Class DTNode #############################################
2162              
2163             # The nodes of the decision tree are instances of this class:
2164              
2165             package DTNode;
2166              
2167 1     1   16 use strict;
  1         2  
  1         25  
2168 1     1   4 use Carp;
  1         1  
  1         847  
2169              
2170             # $feature is the feature test at the current node. $branch_features_and_values_or_thresholds
2171             # is an anonymous array holding the feature names and corresponding values or thresholds
2172             # on the path from the root to the current node:
2173             sub new {
2174 26     26   26 my ($class, $feature, $entropy, $class_probabilities,
2175             $branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2176 26 100       46 $root_or_not = '' if !defined $root_or_not;
2177 26 100       39 if ($root_or_not eq 'root') {
2178 1         2 $dt->{nodes_created} = -1;
2179 1         4 $dt->{class_names} = undef;
2180             }
2181 26         78 my $self = {
2182             _dt => $dt,
2183             _feature => $feature,
2184             _node_creation_entropy => $entropy,
2185             _class_probabilities => $class_probabilities,
2186             _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2187             _linked_to => [],
2188             };
2189 26         29 bless $self, $class;
2190 26         31 $self->{_serial_number} = $self->get_next_serial_num();
2191 26         29 return $self;
2192             }
2193              
2194             sub how_many_nodes {
2195 0     0   0 my $self = shift;
2196 0         0 return $self->{_dt}->{nodes_created} + 1;
2197             }
2198              
2199             sub set_class_names {
2200 1     1   1 my $self = shift;
2201 1         1 my $class_names_list = shift;
2202 1         2 $self->{_dt}->{class_names} = $class_names_list;
2203             }
2204              
2205             sub get_class_names {
2206 0     0   0 my $self = shift;
2207 0         0 return $self->{_dt}->{class_names};
2208             }
2209              
2210             sub get_next_serial_num {
2211 26     26   18 my $self = shift;
2212 26         31 $self->{_dt}->{nodes_created} += 1;
2213 26         35 return $self->{_dt}->{nodes_created};
2214             }
2215              
2216             sub get_serial_num {
2217 26     26   18 my $self = shift;
2218 26         24 $self->{_serial_number};
2219             }
2220              
2221             # this returns the feature test at the current node
2222             sub get_feature {
2223 0     0   0 my $self = shift;
2224 0         0 return $self->{ _feature };
2225             }
2226              
2227             sub set_feature {
2228 18     18   16 my $self = shift;
2229 18         16 my $feature = shift;
2230 18         20 $self->{_feature} = $feature;
2231             }
2232              
2233             sub get_node_entropy {
2234 26     26   21 my $self = shift;
2235 26         26 return $self->{_node_creation_entropy};
2236             }
2237              
2238             sub get_class_probabilities {
2239 0     0   0 my $self = shift;
2240 0         0 return $self->{ _class_probabilities};
2241             }
2242              
2243             sub get_branch_features_and_values_or_thresholds {
2244 26     26   19 my $self = shift;
2245 26         34 return $self->{_branch_features_and_values_or_thresholds};
2246             }
2247              
2248             sub add_to_branch_features_and_values {
2249 0     0   0 my $self = shift;
2250 0         0 my $feature_and_value = shift;
2251 0         0 push @{$self->{ _branch_features_and_values }}, $feature_and_value;
  0         0  
2252             }
2253              
2254             sub get_children {
2255 0     0   0 my $self = shift;
2256 0         0 return $self->{_linked_to};
2257             }
2258              
2259             sub add_child_link {
2260 25     25   22 my ($self, $new_node, ) = @_;
2261 25         16 push @{$self->{_linked_to}}, $new_node;
  25         36  
2262             }
2263              
2264             sub delete_all_links {
2265 0     0   0 my $self = shift;
2266 0         0 $self->{_linked_to} = undef;
2267             }
2268              
2269             sub display_node {
2270 0     0   0 my $self = shift;
2271 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2272 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2273 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2274 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2275 0         0 my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2276 0         0 my $serial_num = $self->get_serial_num();
2277 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2278 0         0 print "\n\nNODE $serial_num" .
2279             ":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2280             "\n Class probabilities at current node: @class_probabilities_for_display" .
2281             "\n Entropy at current node: $print_node_creation_entropy_at_node" .
2282             "\n Best feature test at current node: $feature_at_node\n\n";
2283             }
2284              
2285             sub display_decision_tree {
2286 0     0   0 my $self = shift;
2287 0         0 my $offset = shift;
2288 0         0 my $serial_num = $self->get_serial_num();
2289 0 0       0 if (@{$self->get_children()} > 0) {
  0         0  
2290 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2291 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2292 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2293 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2294 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2295 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2296 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2297             my @print_class_probabilities_with_class =
2298 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2299 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2300 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2301 0         0 print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2302             "$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2303 0         0 $offset .= " ";
2304 0         0 foreach my $child (@{$self->get_children()}) {
  0         0  
2305 0         0 $child->display_decision_tree($offset);
2306             }
2307             } else {
2308 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2309 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2310 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2311 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2312 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2313 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2314             my @print_class_probabilities_with_class =
2315 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2316 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2317 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2318 0         0 print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2319             "Class Probs: @print_class_probabilities_with_class\n\n";
2320             }
2321             }
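## The calls below mirror how EvalTrainingData uses this class higher up in
## this file: construct the tree, display it with an indentation offset, and
## query the node count ($dt is assumed to be a fully initialized
## Algorithm::DecisionTree instance):

my $root_node = $dt->construct_decision_tree_classifier();
$root_node->display_decision_tree("   ");
print "Number of nodes created: ", $root_node->how_many_nodes(), "\n";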
2322              
2323              
2324             ############################## Generate Your Own Numeric Training Data #################################
2325             ############################# Class TrainingDataGeneratorNumeric ################################
2326              
2327             ## See the script generate_training_data_numeric.pl in the examples
2328             ## directory to see how to use this class for generating your own numeric training and
2329             ## test data. The training and test data are generated in accordance with the
2330             ## specifications you place in the parameter file that is supplied as an argument to
2331             ## the constructor of this class.
2332              
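## A hedged usage sketch for this class (the file names and sample counts are
## hypothetical; the two method calls are defined further below):

my $numeric_generator = TrainingDataGeneratorNumeric->new(
                            output_training_csv_file       => 'training_numeric.csv',
                            output_test_csv_file           => 'test_numeric.csv',
                            parameter_file                 => 'param_numeric.txt',
                            number_of_samples_for_training => 300,
                            number_of_samples_for_testing  => 100,
                        );
$numeric_generator->read_parameter_file_numeric();
$numeric_generator->gen_numeric_training_and_test_data_and_write_to_csv();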
2333             package TrainingDataGeneratorNumeric;
2334              
2335 1     1   5 use strict;
  1         1  
  1         16  
2336 1     1   3 use Carp;
  1         1  
  1         943  
2337              
2338             sub new {
2339 0     0   0 my ($class, %args) = @_;
2340 0         0 my @params = keys %args;
2341 0 0       0 croak "\nYou have used a wrong name for a keyword argument " .
2342             "--- perhaps a misspelling\n"
2343             if check_for_illegal_params3(@params) == 0;
2344             bless {
2345             _output_training_csv_file => $args{'output_training_csv_file'}
2346             || croak("name for output_training_csv_file required"),
2347             _output_test_csv_file => $args{'output_test_csv_file'}
2348             || croak("name for output_test_csv_file required"),
2349             _parameter_file => $args{'parameter_file'}
2350             || croak("parameter_file required"),
2351             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2352             || croak("number_of_samples_for_training"),
2353             _number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2354             || croak("number_of_samples_for_testing"),
2355 0   0     0 _debug => $args{debug} || 0,
      0        
      0        
      0        
      0        
      0        
2356             _class_names => [],
2357             _class_names_and_priors => {},
2358             _features_with_value_range => {},
2359             _features_ordered => [],
2360             _classes_and_their_param_values => {},
2361             }, $class;
2362             }
2363              
2364             sub check_for_illegal_params3 {
2365 0     0   0 my @params = @_;
2366 0         0 my @legal_params = qw / output_training_csv_file
2367             output_test_csv_file
2368             parameter_file
2369             number_of_samples_for_training
2370             number_of_samples_for_testing
2371             debug
2372             /;
2373 0         0 my $found_match_flag;
2374 0         0 foreach my $param (@params) {
2375 0         0 foreach my $legal (@legal_params) {
2376 0         0 $found_match_flag = 0;
2377 0 0       0 if ($param eq $legal) {
2378 0         0 $found_match_flag = 1;
2379 0         0 last;
2380             }
2381             }
2382 0 0       0 last if $found_match_flag == 0;
2383             }
2384 0         0 return $found_match_flag;
2385             }
2386              
2387             ## The training data generated by an instance of the class
2388             ## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file
2389             ## that you supply to the class constructor through a constructor variable called
2390             ## `parameter_file'. This method parses the parameter file in order to determine
2391             ## the names to be used for the different data classes, their means, and their
2392             ## variances.
2393             sub read_parameter_file_numeric {
2394 0     0   0 my $self = shift;
2395 0         0 my @class_names = ();
2396 0         0 my %class_names_and_priors = ();
2397 0         0 my %features_with_value_range = ();
2398 0         0 my %classes_and_their_param_values = ();
2399             # my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2400 0   0     0 open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
2401 0         0 my @params = ;
2402 0         0 my $params = join "", @params;
2403 0         0 my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2404 0         0 $params =~ /$regex/si;
2405 0         0 my ($class_names, $class_priors) = ($1, $2);
2406 0         0 @class_names = split ' ', $class_names;
2407 0         0 my @class_priors = split ' ', $class_priors;
2408 0         0 foreach my $i (0..@class_names-1) {
2409 0         0 $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2410             }
2411 0 0       0 if ($self->{_debug}) {
2412 0         0 foreach my $cname (keys %class_names_and_priors) {
2413 0         0 print "$cname => $class_names_and_priors{$cname}\n";
2414             }
2415             }
2416 0         0 $regex = 'feature name: \w*.*?value range: [\d\. -]+';
2417 0         0 my @features = $params =~ /$regex/gsi;
2418 0         0 my @features_ordered;
2419 0         0 $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2420 0         0 foreach my $feature (@features) {
2421 0         0 $feature =~ /$regex/i;
2422 0         0 my $feature_name = $1;
2423 0         0 push @features_ordered, $feature_name;
2424 0         0 my @value_range = split ' ', $2;
2425 0         0 $features_with_value_range{$feature_name} = \@value_range;
2426             }
2427 0 0       0 if ($self->{_debug}) {
2428 0         0 foreach my $fname (keys %features_with_value_range) {
2429 0         0 print "$fname => @{$features_with_value_range{$fname}}\n";
  0         0  
2430             }
2431             }
2432 0         0 foreach my $i (0..@class_names-1) {
2433 0         0 $classes_and_their_param_values{$class_names[$i]} = {};
2434             }
2435 0         0 $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2436 0         0 my @class_params = $params =~ /$regex/gsi;
2437 0         0 $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2438 0         0 foreach my $class_param (@class_params) {
2439 0         0 $class_param =~ /$regex/gsi;
2440 0         0 my $class_name = $1;
2441 0         0 my @class_mean = split ' ', $2;
2442 0         0 $classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2443 0         0 my $class_param_string = $3;
2444 0         0 my @covar_rows = split '\n', $class_param_string;
2445 0         0 my @covar_matrix;
2446 0         0 foreach my $row (@covar_rows) {
2447 0         0 my @row = split ' ', $row;
2448 0         0 push @covar_matrix, \@row;
2449             }
2450 0         0 $classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2451             }
2452 0 0       0 if ($self->{_debug}) {
2453 0         0 print "\nThe class parameters are:\n\n";
2454 0         0 foreach my $cname (keys %classes_and_their_param_values) {
2455 0         0 print "\nFor class name $cname:\n";
2456 0         0 my %params_hash = %{$classes_and_their_param_values{$cname}};
  0         0  
2457 0         0 foreach my $x (keys %params_hash) {
2458 0 0       0 if ($x eq 'mean') {
2459 0         0 print " $x => @{$params_hash{$x}}\n";
  0         0  
2460             } else {
2461 0 0       0 if ($x eq 'covariance') {
2462 0         0 print " The covariance matrix:\n";
2463 0         0 my @matrix = @{$params_hash{'covariance'}};
  0         0  
2464 0         0 foreach my $row (@matrix) {
2465 0         0 print " @$row\n";
2466             }
2467             }
2468             }
2469             }
2470             }
2471             }
2472 0         0 $self->{_class_names} = \@class_names;
2473 0         0 $self->{_class_names_and_priors} = \%class_names_and_priors;
2474 0         0 $self->{_features_with_value_range} = \%features_with_value_range;
2475 0         0 $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2476 0         0 $self->{_features_ordered} = \@features_ordered;
2477             }
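## The regexes above expect the parameter file to be laid out roughly as in
## the sketch below (illustrative only; the class names, priors, means and
## covariances are made up):
##
##     class names: recession goodtimes
##     class priors: 0.4 0.6
##     feature name: gdp
##     value range: 0 100
##     feature name: return_on_investment
##     value range: 0 100
##     params for class: recession
##     mean: 50.0 30.0
##     covariance:
##          1.0 0.0
##          0.0 1.0
##     params for class: goodtimes
##     mean: 80.0 60.0
##     covariance:
##          1.0 0.0
##          0.0 1.0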
2478              
2479             ## After the parameter file is parsed by the previous method, this method calls on
2480             ## Math::Random::random_multivariate_normal() to generate the training and test data
2481             ## samples. Your training and test data can be of any number of dimensions and can
2482             ## have any mean and any covariance. The training and test data must obviously be
2483             ## drawn from the same distribution.
2484             sub gen_numeric_training_and_test_data_and_write_to_csv {
2485 1     1   584 use Math::Random;
  1         4461  
  1         767  
2486 0     0   0 my $self = shift;
2487 0         0 my %training_samples_for_class;
2488             my %test_samples_for_class;
2489 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2490 0         0 $training_samples_for_class{$class_name} = [];
2491 0         0 $test_samples_for_class{$class_name} = [];
2492             }
2493 0         0 foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
  0         0  
2494 0         0 my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
  0         0  
2495 0         0 my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
  0         0  
2496             my @new_training_data = Math::Random::random_multivariate_normal(
2497 0         0 $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2498             @mean, @covariance );
2499             my @new_test_data = Math::Random::random_multivariate_normal(
2500 0         0 $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2501             @mean, @covariance );
2502 0 0       0 if ($self->{_debug}) {
2503 0         0 print "training data for class $class_name:\n";
2504 0         0 foreach my $x (@new_training_data) {print "@$x\n";}
  0         0  
2505 0         0 print "\n\ntest data for class $class_name:\n";
2506 0         0 foreach my $x (@new_test_data) {print "@$x\n";}
  0         0  
2507             }
2508 0         0 $training_samples_for_class{$class_name} = \@new_training_data;
2509 0         0 $test_samples_for_class{$class_name} = \@new_test_data;
2510             }
2511 0         0 my @training_data_records = ();
2512 0         0 my @test_data_records = ();
2513 0         0 foreach my $class_name (keys %training_samples_for_class) {
2514             my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2515 0         0 $self->{_class_names_and_priors}->{$class_name};
2516             my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2517 0         0 $self->{_class_names_and_priors}->{$class_name};
2518 0         0 foreach my $sample_index (0..$num_of_samples_for_training-1) {
2519 0         0 my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2520 0         0 @training_vector = map {sprintf("%.3f", $_)} @training_vector;
  0         0  
2521 0         0 my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2522 0         0 push @training_data_records, $training_data_record;
2523             }
2524 0         0 foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2525 0         0 my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2526 0         0 @test_vector = map {sprintf("%.3f", $_)} @test_vector;
  0         0  
2527 0         0 my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2528 0         0 push @test_data_records, $test_data_record;
2529             }
2530             }
2531 0         0 fisher_yates_shuffle(\@training_data_records);
2532 0         0 fisher_yates_shuffle(\@test_data_records);
2533 0 0       0 if ($self->{_debug}) {
2534 0         0 foreach my $record (@training_data_records) {
2535 0         0 print "$record";
2536             }
2537 0         0 foreach my $record (@test_data_records) {
2538 0         0 print "$record";
2539             }
2540             }
2541 0         0 open OUTPUT, ">$self->{_output_training_csv_file}";
2542 0         0 my @feature_names_training = @{$self->{_features_ordered}};
  0         0  
2543 0         0 my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
  0         0  
2544 0         0 my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2545 0         0 print OUTPUT "$first_row_training\n";
2546 0         0 foreach my $i (0..@training_data_records-1) {
2547 0         0 my $i1 = $i+1;
2548 0         0 my $sample_record = "\"$i1\",$training_data_records[$i]";
2549 0         0 print OUTPUT "$sample_record";
2550             }
2551 0         0 close OUTPUT;
2552 0         0 open OUTPUT, ">$self->{_output_test_csv_file}";
2553 0         0 my @feature_names_testing = keys %{$self->{_features_with_value_range}};
  0         0  
2554 0         0 my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
  0         0  
2555 0         0 my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2556 0         0 print OUTPUT "$first_row_testing\n";
2557 0         0 foreach my $i (0..@test_data_records-1) {
2558 0         0 my $i1 = $i+1;
2559 0         0 my $sample_record = "\"$i1\",$test_data_records[$i]";
2560 0         0 print OUTPUT "$sample_record";
2561             }
2562 0         0 close OUTPUT;
2563             }
2564              
2565             # from perl docs:
2566             sub fisher_yates_shuffle {
2567 0     0   0 my $arr = shift;
2568 0         0 my $i = @$arr;
2569 0         0 while (--$i) {
2570 0         0 my $j = int rand( $i + 1 );
2571 0         0 @$arr[$i, $j] = @$arr[$j, $i];
2572             }
2573             }
2574              
2575             ########################### Generate Your Own Symbolic Training Data ###############################
2576             ########################### Class TrainingDataGeneratorSymbolic #############################
2577              
2578             ## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2579             ## this class for generating purely symbolic training and test data. The data is
2580             ## generated according to the specifications you place in a parameter file whose
2581             ## name you supply as one of the constructor arguments.
2582             package TrainingDataGeneratorSymbolic;
2583              
2584 1     1   6 use strict;
  1         1  
  1         21  
2585 1     1   3 use Carp;
  1         2  
  1         1734  
2586              
2587             sub new {
2588 1     1   14 my ($class, %args) = @_;
2589 1         4 my @params = keys %args;
2590 1 50       3 croak "\nYou have used a wrong name for a keyword argument " .
2591             "--- perhaps a misspelling\n"
2592             if check_for_illegal_params4(@params) == 0;
2593             bless {
2594             _output_training_datafile => $args{'output_training_datafile'}
2595             || die("name for output_training_datafile required"),
2596             _parameter_file => $args{'parameter_file'}
2597             || die("parameter_file required"),
2598             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2599             || die("number_of_samples_for_training required"),
2600 1   50     17 _debug => $args{debug} || 0,
      50        
      50        
      50        
2601             _class_names => [],
2602             _class_priors => [],
2603             _features_and_values_hash => {},
2604             _bias_hash => {},
2605             _training_sample_records => {},
2606             }, $class;
2607             }
2608              
2609             sub check_for_illegal_params4 {
2610 1     1   2 my @params = @_;
2611 1         2 my @legal_params = qw / output_training_datafile
2612             parameter_file
2613             number_of_samples_for_training
2614             debug
2615             /;
2616 1         1 my $found_match_flag;
2617 1         2 foreach my $param (@params) {
2618 3         3 foreach my $legal (@legal_params) {
2619 6         3 $found_match_flag = 0;
2620 6 100       10 if ($param eq $legal) {
2621 3         2 $found_match_flag = 1;
2622 3         3 last;
2623             }
2624             }
2625 3 50       6 last if $found_match_flag == 0;
2626             }
2627 1         3 return $found_match_flag;
2628             }
2629              
2630             ## Read a parameter file for generating symbolic training data. See the script
2631             ## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2632             ## pass the name of the parameter file to the constructor of the
2633             ## TrainingDataGeneratorSymbolic class.
2634             sub read_parameter_file_symbolic {
2635 1     1   4 my $self = shift;
2636 1         5 my $debug = $self->{_debug};
2637 1         1 my $number_of_training_samples = $self->{_number_of_samples_for_training};
2638 1         2 my $input_parameter_file = $self->{_parameter_file};
2639 1 50       3 croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2640 1         2 my $output_file_training = $self->{_output_training_datafile};
2641 1         1 my $output_file_testing = $self->{_output_test_datafile};
2642 1         1 my @all_params;
2643             my $param_string;
2644 1   33     24 open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
2645 1         30 @all_params = <INPUT>;
2646 1         3 @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
  40         46  
2647 1         3 @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
  36         58  
2648 1         5 $param_string = join ' ', @all_params;
2649 1         7 my ($class_names, $class_priors, $rest_param) =
2650             $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2651 1 50       40 my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
  3         14  
2652 1         1 push @{$self->{_class_names}}, @class_names;
  1         2  
2653 1 50       5 my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
  3         10  
2654 1         1 push @{$self->{_class_priors}}, @class_priors;
  1         3  
2655 1         8 my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2656 1         1 my %features_and_values_hash;
2657 1         6 my @features = split /(feature[:])/, $feature_string;
2658 1 50       5 @features = grep {defined($_) && length($_) > 0} @features;
  9         23  
2659 1         2 foreach my $item (@features) {
2660 8 100       12 next if $item =~ /feature/;
2661 4         13 my @splits = split / /, $item;
2662 4 50       4 @splits = grep {defined($_) && length($_) > 0} @splits;
  27         61  
2663 4         8 foreach my $i (0..@splits-1) {
2664 22 100       24 if ($i == 0) {
2665 4         7 $features_and_values_hash{$splits[0]} = [];
2666             } else {
2667 18 100       25 next if $splits[$i] =~ /values/;
2668 14         9 push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
  14         21  
2669             }
2670             }
2671             }
2672 1         2 $self->{_features_and_values_hash} = \%features_and_values_hash;
2673 1         1 my %bias_hash = %{$self->{_bias_hash}};
  1         3  
2674 1         7 my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2675 1 50       1 @biases = grep {defined($_) && length($_) > 0} @biases;
  5         13  
2676 1         2 foreach my $item (@biases) {
2677 4 100       8 next if $item =~ /bias/;
2678 2         16 my @splits = split /\s+/, $item;
2679 2 50       4 @splits = grep {defined($_) && length($_) > 0} @splits;
  18         43  
2680 2         1 my $feature_name;
2681 2         4 foreach my $i (0..@splits-1) {
2682 16 100       29 if ($i == 0) {
    100          
2683 2         3 $bias_hash{$splits[0]} = {};
2684             } elsif ($splits[$i] =~ /(^.+)[:]$/) {
2685 8         9 $feature_name = $1;
2686 8         14 $bias_hash{$splits[0]}->{$feature_name} = [];
2687             } else {
2688 6 50       9 next if !defined $feature_name;
2689 6 50       8 push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
  6         11  
2690             if defined $feature_name;
2691             }
2692             }
2693             }
2694 1         1 $self->{_bias_hash} = \%bias_hash;
2695 1 50       10 if ($debug) {
2696 0         0 print "\n\nClass names: @class_names\n";
2697 0         0 my $num_of_classes = @class_names;
2698 0         0 print "Class priors: @class_priors\n";
2699 0         0 print "Number of classes: $num_of_classes\n";
2700 0         0 print "\nHere are the features and their possible values:\n";
2701 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
2702 0         0 print "$k ===> @$v\n";
2703             }
2704 0         0 print "\nHere is the biasing for each class:\n";
2705 0         0 while ( my ($k, $v) = each %bias_hash ) {
2706 0         0 print "$k:\n";
2707 0         0 while ( my ($k1, $v1) = each %$v ) {
2708 0         0 print " $k1 ===> @$v1\n";
2709             }
2710             }
2711             }
2712             }
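## The parsing above expects the parameter file to look roughly like the
## sketch below (illustrative only; the class names, features, values and
## biases are made up):
##
##     class names: malignant benign
##     class priors: 0.4 0.6
##     feature: smoking
##     values: heavy medium light never
##     feature: exercising
##     values: never occasionally regularly
##     bias: class: malignant
##           smoking:    heavy=0.7
##           exercising: never=0.7
##     bias: class: benign
##           smoking:    never=0.7
##           exercising: regularly=0.7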
2713              
2714             ## This method generates training data according to the specifications placed in a
2715             ## parameter file that is read by the previous method.
2716             sub gen_symbolic_training_data {
2717 1     1   5 my $self = shift;
2718 1         1 my @class_names = @{$self->{_class_names}};
  1         3  
2719 1         1 my @class_priors = @{$self->{_class_priors}};
  1         3  
2720 1         1 my %training_sample_records;
2721 1         1 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  1         4  
2722 1         1 my %bias_hash = %{$self->{_bias_hash}};
  1         3  
2723 1         1 my $how_many_training_samples = $self->{_number_of_samples_for_training};
2724 1         1 my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2725 1         1 my %class_priors_to_unit_interval_map;
2726 1         2 my $accumulated_interval = 0;
2727 1         2 foreach my $i (0..@class_names-1) {
2728 2         5 $class_priors_to_unit_interval_map{$class_names[$i]}
2729             = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2730 2         4 $accumulated_interval += $class_priors[$i];
2731             }
2732 1 50       2 if ($self->{_debug}) {
2733 0         0 print "Mapping of class priors to unit interval: \n";
2734 0         0 while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2735 0         0 print "$k => @$v\n";
2736             }
2737 0         0 print "\n\n";
2738             }
2739 1         2 my $ele_index = 0;
2740 1         2 while ($ele_index < $how_many_training_samples) {
2741 35         33 my $sample_name = "sample" . "_$ele_index";
2742 35         50 $training_sample_records{$sample_name} = [];
2743             # Generate class label for this training sample:
2744 35         49 my $roll_the_dice = rand(1.0);
2745 35         33 my $class_label;
2746 35         39 foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2747 46         30 my $v = $class_priors_to_unit_interval_map{$class_name};
2748 46 100 66     121 if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2749 35         21 push @{$training_sample_records{$sample_name}},
  35         51  
2750             "class=" . $class_name;
2751 35         25 $class_label = $class_name;
2752 35         36 last;
2753             }
2754             }
2755 35         45 foreach my $feature (keys %features_and_values_hash) {
2756 140         88 my @values = @{$features_and_values_hash{$feature}};
  140         212  
2757 140         123 my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2758 140         111 my $no_bias = 1.0 / @values;
2759 140 100       271 $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2760 140         68 my %value_priors_to_unit_interval_map;
2761 140         352 my @splits = split /\s*=\s*/, $bias_string;
2762 140         102 my $chosen_for_bias_value = $splits[0];
2763 140         91 my $chosen_bias = $splits[1];
2764 140         151 my $remaining_bias = 1 - $chosen_bias;
2765 140         123 my $remaining_portion_bias = $remaining_bias / (@values -1);
2766 140 50       111 @splits = grep {defined($_) && length($_) > 0} @splits;
  280         786  
2767 140         105 my $accumulated = 0;
2768 140         149 foreach my $i (0..@values-1) {
2769 490 100       481 if ($values[$i] eq $chosen_for_bias_value) {
2770 140         197 $value_priors_to_unit_interval_map{$values[$i]}
2771             = [$accumulated, $accumulated + $chosen_bias];
2772 140         124 $accumulated += $chosen_bias;
2773             } else {
2774 350         411 $value_priors_to_unit_interval_map{$values[$i]}
2775             = [$accumulated, $accumulated + $remaining_portion_bias];
2776 350         283 $accumulated += $remaining_portion_bias;
2777             }
2778             }
2779 140         108 my $roll_the_dice = rand(1.0);
2780 140         80 my $value_label;
2781 140         173 foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2782 312         222 my $v = $value_priors_to_unit_interval_map{$value_name};
2783 312 100 100     749 if ( ($roll_the_dice >= $v->[0])
2784             && ($roll_the_dice <= $v->[1]) ) {
2785 140         79 push @{$training_sample_records{$sample_name}},
  140         268  
2786             $feature . "=" . $value_name;
2787 140         91 $value_label = $value_name;
2788 140         102 last;
2789             }
2790             }
2791 140 50       401 if ($self->{_debug}) {
2792 0         0 print "mapping feature value priors for '$feature' " .
2793             "to unit interval: \n";
2794 0         0 while ( my ($k, $v) =
2795             each %value_priors_to_unit_interval_map ) {
2796 0         0 print "$k => @$v\n";
2797             }
2798 0         0 print "\n\n";
2799             }
2800             }
2801 35         61 $ele_index++;
2802             }
2803 1         2 $self->{_training_sample_records} = \%training_sample_records;
2804 1 50       3 if ($self->{_debug}) {
2805 0         0 print "\n\nPRINTING TRAINING RECORDS:\n\n";
2806 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
  0         0  
2807 0         0 print "$kee => @{$training_sample_records{$kee}}\n\n";
  0         0  
2808             }
2809             }
2810 1         1 my $output_training_file = $self->{_output_training_datafile};
2811 1 50       3 print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2812 1         77 open FILEHANDLE, ">$output_training_file";
2813 1         7 my @features = sort keys %features_and_values_hash;
2814 1         1 my $title_string = ',class';
2815 1         2 foreach my $feature_name (@features) {
2816 4         5 $title_string .= ',' . $feature_name;
2817             }
2818 1         11 print FILEHANDLE "$title_string\n";
2819 1         11 my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
  105         91  
  35         38  
  35         36  
2820 1         5 my $record_string = '';
2821 1         2 foreach my $sample_name (@sample_names) {
2822 35         27 $record_string .= "$sample_name,";
2823 35         19 my @record = @{$training_sample_records{"sample_$sample_name"}};
  35         55  
2824 35         45 my %item_parts_hash;
2825 35         26 foreach my $item (@record) {
2826 175         220 my @splits = grep $_, split /=/, $item;
2827 175         196 $item_parts_hash{$splits[0]} = $splits[1];
2828             }
2829 35         25 $record_string .= $item_parts_hash{"class"};
2830 35         29 delete $item_parts_hash{"class"};
2831 35         60 my @kees = sort keys %item_parts_hash;
2832 35         30 foreach my $kee (@kees) {
2833 140         115 $record_string .= ",$item_parts_hash{$kee}";
2834             }
2835 35         33 print FILEHANDLE "$record_string\n";
2836 35         47 $record_string = '';
2837             }
2838 1         36 close FILEHANDLE;
2839             }
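## With a parameter file along the lines of the sketch shown after
## read_parameter_file_symbolic() above, the generated training file would
## start out roughly like this (the rows are random draws, so the values
## below are only illustrative):
##
##     ,class,exercising,smoking
##     0,benign,regularly,never
##     1,malignant,never,heavy
##     2,benign,occasionally,light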
2840              
2841             sub sample_index {
2842 0     0     my $arg = shift;
2843 0           $arg =~ /_(.+)$/;
2844 0           return $1;
2845             }
2846              
2847             ################################# Decision Tree Introspection #######################################
2848             ################################# Class DTIntrospection #######################################
2849              
2850             package DTIntrospection;
2851              
2852             ## Instances constructed from this class can provide explanations for the
2853             ## classification decisions at the nodes of a decision tree.
2854             ##
2855             ## When used in the interactive mode, the decision-tree introspection made possible
2856             ## by this class provides answers to the following three questions: (1) List of the
2857             ## training samples that fall in the portion of the feature space that corresponds
2858             ## to a node of the decision tree; (2) The probabilities associated with the last
2859             ## feature test that led to the node; and (3) The class probabilities predicated on
2860             ## just the last feature test on the path to that node.
2861             ##
2862             ## CAVEAT: It is possible for a node to exist even when there are no training
2863             ## samples in the portion of the feature space that corresponds to the node. That
2864             ## is because a decision tree is based on the probability densities estimated from
2865             ## the training data. When training data is non-uniformly distributed, it is
2866             ## possible for the probability associated with a point in the feature space to be
2867             ## non-zero even when there are no training samples at or in the vicinity of that
2868             ## point.
2869             ##
2870             ## For a node to exist even where there are no training samples in the portion of
2871             ## the feature space that belongs to the node is an indication of the generalization
2872             ## ability of decision-tree based classification.
2873             ##
2874             ## When used in a non-interactive mode, an instance of this class can be used to
2875             ## create a tabular display that shows what training samples belong directly to the
2876             ## portion of the feature space that corresponds to each node of the decision tree.
2877             ## An instance of this class can also construct a tabular display that shows how the
2878             ## influence of each training sample propagates in the decision tree. For each
2879             ## training sample, this display first shows the list of nodes that came into
2880             ## existence through feature test(s) that used the data provided by that sample.
2881             ## This list for each training sample is followed by a subtree of the nodes that owe
2882             ## their existence indirectly to the training sample. A training sample influences a
2883             ## node indirectly if the node is a descendant of another node that is affected
2884             ## directly by the training sample.
2885              
2886 1     1   5 use strict;
  1         1  
  1         17  
2887 1     1   4 use Carp;
  1         0  
  1         1992  
2888              
2889             sub new {
2890 0     0     my ($class, $dt) = @_;
2891 0 0         croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2892             unless ref($dt) eq "Algorithm::DecisionTree";
2893             bless {
2894             _dt => $dt,
2895             _root_dtnode => $dt->{_root_node},
2896 0           _samples_at_nodes_hash => {},
2897             _branch_features_to_nodes_hash => {},
2898             _sample_to_node_mapping_direct_hash => {},
2899             _node_serial_num_to_node_hash => {},
2900             _awareness_raising_msg_shown => 0,
2901             _debug => 0,
2902             }, $class;
2903             }
2904              
2905             sub initialize {
2906 0     0     my $self = shift;
2907             croak "You must first construct the decision tree before using the DTIntrospection class"
2908 0 0         unless $self->{_root_dtnode};
2909 0           $self->recursive_descent($self->{_root_dtnode});
2910             }
2911              
2912             sub recursive_descent {
2913 0     0     my $self = shift;
2914 0           my $node = shift;
2915 0           my $node_serial_number = $node->get_serial_num();
2916 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2917 0 0         print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
  0            
2918 0           $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2919 0           $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2920 0           my @samples_at_node = ();
2921 0           foreach my $item (@$branch_features_and_values_or_thresholds) {
2922 0           my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2923 0 0         unless (@samples_at_node) {
2924 0           @samples_at_node = @$samples_for_feature_value_combo;
2925             } else {
2926 0           my @accum;
2927 0           foreach my $sample (@samples_at_node) {
2928 0 0         push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2929             }
2930 0           @samples_at_node = @accum;
2931             }
2932 0 0         last unless @samples_at_node;
2933             }
2934 0           @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
  0            
2935 0 0         print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2936 0           $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2937 0 0         if (@samples_at_node) {
2938 0           foreach my $sample (@samples_at_node) {
2939 0 0         if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2940 0           $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2941             } else {
2942 0           push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
  0            
2943             }
2944             }
2945             }
2946 0           my $children = $node->get_children();
2947 0           foreach my $child (@$children) {
2948 0           $self->recursive_descent($child);
2949             }
2950             }
2951              
2952             sub display_training_samples_at_all_nodes_direct_influence_only {
2953 0     0     my $self = shift;
2954             croak "You must first construct the decision tree before using the DT Introspection class."
2955 0 0         unless $self->{_root_dtnode};
2956 0           $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2957             }
2958              
2959             sub recursive_descent_for_showing_samples_at_a_node{
2960 0     0     my $self = shift;
2961 0           my $node = shift;
2962 0           my $node_serial_number = $node->get_serial_num();
2963 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2964 0 0         if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2965 0 0         print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
  0            
2966 0           print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
  0            
2967             }
2968 0           map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
  0            
2969             }
2970              
2971             sub display_training_samples_to_nodes_influence_propagation {
2972 0     0     my $self = shift;
2973 0           foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
  0            
2974 0 0         if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2975 0           my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2976 0           print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
  0            
2977 0           print " nodes affected through probabilistic generalization:\n";
2978 0           map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2979             }
2980             }
2981             }
2982              
2983             sub recursive_descent_for_sample_to_node_influence {
2984 0     0     my $self = shift;
2985 0           my $node_serial_num = shift;
2986 0           my $nodes_already_accounted_for = shift;
2987 0           my $offset = shift;
2988 0           $offset .= " ";
2989 0           my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2990 0           my @children = map $_->get_serial_num(), @{$node->get_children()};
  0            
2991 0           my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
  0            
  0            
2992 0 0         if (@children_affected) {
2993 0           print "$offset $node_serial_num => [@children_affected]\n";
2994             }
2995 0           map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
2996             }
2997              
2998             sub get_samples_for_feature_value_combo {
2999 0     0     my $self = shift;
3000 0           my $feature_value_combo = shift;
3001 0           my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3002 0           my @samples = ();
3003 0 0         if ($op eq '=') {
    0          
    0          
3004 0           @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
  0            
  0            
3005             } elsif ($op eq '<') {
3006 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3007 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3008 0           foreach my $item (@features_and_values) {
3009 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3010 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
      0        
3011 0           push @samples, $sample;
3012 0           last;
3013             }
3014             }
3015             }
3016             } elsif ($op eq '>') {
3017 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3018 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3019 0           foreach my $item (@features_and_values) {
3020 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3021 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
      0        
3022 0           push @samples, $sample;
3023 0           last;
3024             }
3025             }
3026             }
3027             } else {
3028 0           die "Something strange is going on";
3029             }
3030 0           return \@samples;
3031             }
3032              
3033             sub extract_feature_op_val {
3034 0     0     my $self = shift;
3035 0           my $feature_value_combo = shift;
3036 0           my $pattern1 = '(.+)=(.+)';
3037 0           my $pattern2 = '(.+)<(.+)';
3038 0           my $pattern3 = '(.+)>(.+)';
3039 0           my ($feature,$value,$op);
3040 0 0         if ($feature_value_combo =~ /$pattern2/) {
    0          
    0          
3041 0           ($feature,$op,$value) = ($1,'<',$2);
3042             } elsif ($feature_value_combo =~ /$pattern3/) {
3043 0           ($feature,$op,$value) = ($1,'>',$2);
3044             } elsif ($feature_value_combo =~ /$pattern1/) {
3045 0           ($feature,$op,$value) = ($1,'=',$2);
3046             }
3047 0           return ($feature,$op,$value);
3048             }
3049              
3050             sub explain_classifications_at_multiple_nodes_interactively {
3051 0     0     my $self = shift;
3052             croak "You called explain_classifications_at_multiple_nodes_interactively() without " .
3053             "first initializing the DTIntrospection instance in your code. Aborting."
3054 0 0         unless $self->{_samples_at_nodes_hash};
3055 0           print "\n\nIn order for the decision tree to introspect\n\n";
3056 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3057             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3058             " with it even when there are NO training data points in that region?\n" .
3059             "\nEnter 'y' for yes or any other character for no: ";
3060 0           my $ans = <STDIN>;
3061 0           $ans =~ s/^\s*|\s*$//g;
3062 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3063 0           $self->{_awareness_raising_msg_shown} = 1;
3064 0           while (1) {
3065 0           my $node_id;
3066             my $ans;
3067 0           while (1) {
3068 0           print "\nEnter the integer ID of a node: ";
3069 0           $ans = <STDIN>;
3070 0           $ans =~ s/^\s*|\s*$//g;
3071 0 0         return if $ans =~ /^exit$/;
3072 0 0         last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
  0            
3073 0           print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3074             }
3075 0           $node_id = $ans;
3076 0           $self->explain_classification_at_one_node($node_id)
3077             }
3078             }
3079              
3080             sub explain_classification_at_one_node {
3081 0     0     my $self = shift;
3082 0           my $node_id = shift;
3083             croak "You called explain_classification_at_one_node() without first initializing " .
3084 0 0         "the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3085 0 0         unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3086 0           print "Node $node_id is not a node in the tree\n";
3087 0           return;
3088             }
3089 0 0         unless ($self->{_awareness_raising_msg_shown}) {
3090 0           print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3091 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3092             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3093             " with it even when there are NO training data points in that region?\n" .
3094             "\nEnter 'y' for yes or any other character for no: ";
3095 0           my $ans = <STDIN>;
3096 0           $ans =~ s/^\s*|\s*$//g;
3097 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3098             }
3099 0           my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
  0            
3100 0           my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
  0            
3101             # my @class_names = @{$self->get_class_names()};
3102 0           my @class_names = $self->{_dt}->get_class_names();
3103 0           my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3104 0           my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3105 0 0         my $msg = @samples_at_node == 0
3106             ? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3107             : "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3108 0           $msg .= "\n Feature tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3109 0           $msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3110 0           $msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3111 0           print $msg;
3112 0           my $ans = <STDIN>;
3113 0           $ans =~ s/^\s*|\s*$//g;
3114 0 0         if ($ans =~ /^ye?s?$/) {
3115 0           my $sequence = [$branch_features_to_node[-1]];
3116 0           my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3117 0           print "\n probability of @{$sequence} is: $prob\n";
  0            
3118             }
3119 0           $msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3120 0           $msg .= "\n Enter 'y' for yes and any other character for no: ";
3121 0           print $msg;
3122 0           $ans = <STDIN>;
3123 0           $ans =~ s/^\s*|\s*$//g;
3124 0 0         if ($ans =~ /^ye?s?$/) {
3125 0           my $sequence = [$branch_features_to_node[-1]];
3126 0           foreach my $cls (@class_names) {
3127 0           my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3128 0           print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
  0            
3129             }
3130             } else {
3131 0           print "goodbye\n";
3132             }
3133 0           print "\n Finished supplying information on Node $node_id\n\n";
3134             }
3135              
3136             1;
3137              
3138             =pod
3139              
3140             =head1 NAME
3141              
3142             Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3143             multidimensional data.
3144              
3145              
3146             =head1 SYNOPSIS
3147              
3148             # FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3149              
3150             # In general, your call for constructing an instance of the DecisionTree class
3151             # will look like:
3152              
3153             my $training_datafile = "stage3cancer.csv";
3154             my $dt = Algorithm::DecisionTree->new(
3155             training_datafile => $training_datafile,
3156             csv_class_column_index => 2,
3157             csv_columns_for_features => [3,4,5,6,7,8],
3158             entropy_threshold => 0.01,
3159             max_depth_desired => 8,
3160             symbolic_to_numeric_cardinality_threshold => 10,
3161             csv_cleanup_needed => 1,
3162             );
3163              
3164             # The constructor option `csv_class_column_index' informs the module as to which
3165             # column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3166             # BASED. The constructor option `csv_columns_for_features' specifies which columns
3167             # are to be used for feature values. The first row of the CSV file must specify
3168             # the names of the features. See examples of CSV files in the `Examples'
3169             # subdirectory.
3170              
3171             # The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3172             # the example shown above, if an ostensibly numeric feature takes on only 10 or
3173             # fewer different values in your training datafile, it will be treated like a
3174             # symbolic feature. The option `entropy_threshold' determines the granularity
3175             # with which the entropies are sampled for the purpose of calculating entropy gain
3176             # with a particular choice of decision threshold for a numeric feature or a feature
3177             # value for a symbolic feature.
3178              
3179             # The option 'csv_cleanup_needed' is by default set to 0. If you set it
3180             # to 1, that would cause all line records in your CSV file to be "sanitized" before
3181             # they are used for constructing a decision tree. You need this option if your CSV
3182             # file uses double-quoted field names and field values in the line records and if
3183             # such double-quoted strings are allowed to include commas for, presumably, better
3184             # readability.
3185              
3186             # After you have constructed an instance of the DecisionTree class as shown above,
3187             # you read in the training data file and initialize the probability cache by
3188             # calling:
3189              
3190             $dt->get_training_data();
3191             $dt->calculate_first_order_probabilities();
3192             $dt->calculate_class_priors();
3193              
3194             # Next you construct a decision tree for your training data by calling:
3195              
3196             $root_node = $dt->construct_decision_tree_classifier();
3197              
3198             # where $root_node is an instance of the DTNode class that is also defined in the
3199             # module file. Now you are ready to classify a new data record. Let's say that
3200             # your data record looks like:
3201              
3202             my @test_sample = qw / g2=4.2
3203             grade=2.3
3204             gleason=4
3205             eet=1.7
3206             age=55.0
3207             ploidy=diploid /;
3208              
3209             # You can classify it by calling:
3210              
3211             my $classification = $dt->classify($root_node, \@test_sample);
3212              
3213             # The call to `classify()' returns a reference to a hash whose keys are the class
3214             # names and the values the associated classification probabilities. This hash also
3215             # includes another key-value pair for the solution path from the root node to the
3216             # leaf node at which the final classification was carried out.
3217              
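             # Given the hash reference returned by `classify()', you might list the class
             # probabilities in decreasing order as in the sketch below. The sketch assumes
             # the solution path is stored under the key 'solution_path'; treat that key
             # name, like the variable names, as illustrative rather than definitive:

                 my %results = %$classification;
                 my $solution_path = delete $results{'solution_path'};   # assumed key name
                 foreach my $class_name (sort {$results{$b} <=> $results{$a}} keys %results) {
                     print "$class_name => $results{$class_name}\n";
                 }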
3218              
3219             =head1 CHANGES
3220              
3221             This version reintroduces C<csv_cleanup_needed> as an optional
3222             parameter in the module constructor. This was done in response to several requests
3223             received from the user community. (Previously, all line records from a CSV file were
3224             run through the CSV cleanup code no matter what.) The main point made by
3225             the users was that invoking the cleanup when there was no need for it
3226             exacted a performance penalty when ingesting large database files with tens of
3227             thousands of line records. In addition to making the cleanup optional, I
3228             have also tweaked the cleanup code so that it can extract
3229             data from a larger range of messy CSV files.
3230              
3231             B All the changes made in this version relate to the construction of
3232             regression trees. I have fixed a couple of bugs in the calculation of the regression
3233             coefficients. Additionally, the C class now comes with a new
3234             constructor parameter named C. For most cases, you'd set this
3235             parameter to 0, which causes the regression coefficients to be estimated through
3236             linear least-squares minimization.
3237              
3238             B In addition to constructing decision trees, this version of the
3239             module also allows you to construct regression trees. The regression tree capability
3240             has been packed into a separate subclass, named C, of the main
3241             C class. The subdirectory C in the main
3242             installation directory illustrates how you can use this new functionality of the
3243             module.
3244              
3245             B This version incorporates four very significant upgrades/changes to
3246             the C module: B<(1)> The CSV cleanup is now the default. So you do not
3247             have to set any special parameters in the constructor calls to initiate CSV
3248             cleanup. B<(2)> In the form of a new Perl class named C,
3249             this module provides you with an easy-to-use programming interface for attempting
3250             needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3251             dominated by a single class. You need to set the constructor parameter
3252             C to invoke the logic that constructs multiple
3253             decision trees, each using the minority class samples along with samples drawn
3254             randomly from the majority class. The final classification is made through a
3255             majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3256             big-data problem --- in the sense that you have been given a training database with a
3257             very large number of training records --- the class C will
3258             also let you construct multiple decision trees by pulling training data randomly from
3259             your training database (without paying attention to the relative populations of the
3260             classes). The final classification decision for a test sample is based on a majority
3261             vote from all the decision trees thus constructed. See the C
3262             directory for how to use these new features of the module. And, finally, B<(4)>
3263             Support for the old-style '.dat' training files has been dropped in this version.
3264              
3265             B This version makes it easier to use a CSV training file that
3266             violates the assumption that a comma be used only to separate the different field
3267             values in a line record. Some large econometrics databases use double-quoted values
3268             for fields, and these values may contain commas (presumably for better readability).
3269             This version also allows you to specify the leftmost entry in the first CSV record
3270             that names all the fields. Previously, this entry was required to be an empty
3271             double-quoted string. I have also made some minor changes to the
3272             'C' method to make it more user friendly for large
3273             training files that may contain tens of thousands of records. When pulling training
3274             data from such files, this method prints out a dot on the terminal screen for every
3275             10000 records it has processed.
3276              
3277             This version brings the boosting capability to the C<Algorithm::DecisionTree>
3278             module.
3279              
3280             This version adds bagging to the C<Algorithm::DecisionTree> module. If your
3281             training dataset is large enough, you can ask the module to construct multiple
3282             decision trees using data bags extracted from your dataset. The module can show you
3283             the results returned by the individual decision trees and also the results obtained
3284             by taking a majority vote of the classification decisions made by the individual
3285             trees. You can specify any arbitrary extent of overlap between the data bags.
3286              
3287             B The introspection capability in this version packs more of a punch.
3288             For each training data sample, you can now figure out not only the decision-tree
3289             nodes that are affected directly by that sample, but also those nodes that are
3290             affected indirectly through the generalization achieved by the probabilistic modeling
3291             of the data. The 'examples' directory of this version includes additional scripts
3292             that illustrate these enhancements to the introspection capability. See the section
3293             "The Introspection API" for a declaration of the introspection related methods, old
3294             and new.
3295              
3296             In response to requests from several users, this version includes a new
3297             capability: You can now ask the module to introspect about the classification
3298             decisions returned by the decision tree. Toward that end, the module includes a new
3299             class named C<DTIntrospection>. Perhaps the most important bit of information you
3300             are likely to seek through DT introspection is the list of the training samples that
3301             fall directly in the portion of the feature space that is assigned to a node.
3302             When training samples are non-uniformly distributed in the underlying
3303             feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3304             IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3305             part of the generalization achieved by probabilistic modeling of the training data.)>
3306             For additional information related to DT introspection, see the section titled
3307             "DECISION TREE INTROSPECTION" in this documentation page.
3308              
3309             B makes the logic of tree construction from the old-style '.dat' training
3310             files more consistent with how trees are constructed from the data in `.csv' files.
3311             The inconsistency in the past was with respect to the naming convention for the class
3312             labels associated with the different data records.
3313              
3314             B fixes a bug in the part of the module that some folks use for generating
3315             synthetic data for experimenting with decision tree construction and classification.
3316             In the class C that is a part of the module, there
3317             was a problem with the order in which the features were recorded from the
3318             user-supplied parameter file. The basic code for decision tree construction and
3319             classification remains unchanged.
3320              
3321             B further downshifts the required version of Perl for this module. This
3322             was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3323             in the module code needed to be changed for the module to work with the older version
3324             of Perl.
3325              
3326             B fixes the C restriction on the required Perl version. This
3327             version should work with Perl versions 5.14.0 and higher.
3328              
3329             B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3330             else remains the same.
3331              
3332             B should prove more robust when the probability distribution for the
3333             values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3334             observations can occur with significant probabilities. A new option in the
3335             DecisionTree constructor lets the user specify the precision with which the
3336             probability distributions are estimated for such features.
3337              
3338             B fixes a bug that was caused by the explicitly set zero values for
3339             numerical features being misconstrued as "false" in the conditional statements in
3340             some of the method definitions.
3341              
3342             B makes it easier to write code for classifying in one go all of your test
3343             data samples in a CSV file. The bulk classifications obtained can be written out to
3344             either a CSV file or to a regular text file. See the script
3345             C in the C directory for how to
3346             classify all of your test data records in a CSV file. This version also includes
3347             improved code for generating synthetic numeric/symbolic training and test data
3348             records for experimenting with the decision tree classifier.
3349              
3350             B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
3351             cross-validation test on the data. This test divides all of the training data into
3352             ten parts, with nine parts used for training a decision tree and one part used for
3353             testing its ability to classify correctly. This selection of nine parts for training
3354             and one part for testing is carried out in all of the ten different ways that are
3355             possible. This testing functionality in Version 2.1 can also be used to find the
3356             best values to use for the constructor parameters C<entropy_threshold>,
3357             C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.
3358              
3359             B Now you can use both numeric and
3360             symbolic features for constructing a decision tree. A feature is numeric if it can
3361             take any floating-point value over an interval.
3362              
3363             B fixes a bug in the code that was triggered by 0 being declared as one of
3364             the features values in the training datafile. Version 1.71 also include an additional
3365             safety feature that is useful for training datafiles that contain a very large number
3366             of features. The new version makes sure that the number of values you declare for
3367             each sample record matches the number of features declared at the beginning of the
3368             training datafile.
3369              
3370             B includes safety checks on the consistency of the data you place in your
3371             training datafile. When a training file contains thousands of samples, it is
3372             difficult to manually check that you used the same class names in your sample records
3373             that you declared at top of your training file or that the values you have for your
3374             features are legal vis-a-vis the earlier declarations of the values in the training
3375             file. Another safety feature incorporated in this version is the non-consideration
3376             of classes that are declared at the top of the training file but that have no sample
3377             records in the file.
3378              
3379             B uses probability caching much more extensively compared to the previous
3380             versions. This should result in faster construction of large decision trees.
3381             Another new feature in Version 1.6 is the use of a decision tree for interactive
3382             classification. In this mode, after you have constructed a decision tree from the
3383             training data, the user is prompted for answers to the questions pertaining to the
3384             feature tests at the nodes of the tree.
3385              
3386             Some key elements of the documentation were cleaned up and made more readable in
3387             B. The implementation code remains unchanged from Version 1.4.
3388              
3389             B should make things faster (and easier) for folks who want to use this
3390             module with training data that creates very large decision trees (that is, trees with
3391             tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3392             achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3393             addition, this version provides an additional constructor parameter,
3394             C for controlling the size of the decision tree. This is in
3395             addition to the tree size control achieved by the parameter C that
3396             was introduced in Version 1.3. Since large decision trees can take a long time to
3397             create, you may find yourself wishing you could store the tree you just created in a
3398             disk file and that, subsequently, you could use the stored tree for classification
3399             work. The C directory contains two scripts, C and
3400             C, that show how you can do exactly that with the
3401             help of Perl's C module.
3402              
3403             B addresses the issue that arises when the header of a training datafile
3404             declares a certain possible value for a feature but that (feature,value) pair does
3405             NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3406             user to control the size of the decision tree by changing the value of the parameter
3407             C Additionally, Version 1.3 includes a method called
3408             C that displays useful information regarding the size and
3409             some other attributes of the training data. It also warns the user if the training
3410             data might result in a decision tree that would simply be much too large --- unless
3411             the user controls the size with the entropy_threshold parameter.
3412              
3413             In addition to the removal of a couple of serious bugs, B incorporates a
3414             number of enhancements: (1) Version 1.2 includes checks on the names of the features
3415             and values used in test data --- this is the data you want to classify with the
3416             decision tree classifier constructed by this module. (2) Version 1.2 includes a
3417             separate constructor for generating test data. To make it easier to generate test
3418             data whose probabilistic parameters may not be identical to that used for the
3419             training data, I have used separate routines for generating the test data. (3)
3420             Version 1.2 also includes in its examples directory a script that classifies the test
3421             data in a file and outputs the class labels into another file. This is for folks who
3422             do not wish to write their own scripts using this module. (4) Version 1.2 also
3423             includes addition to the documentation regarding the issue of numeric values for
3424             features.
3425              
3426             =head1 DESCRIPTION
3427              
3428             B is a I module for constructing a decision tree from
3429             a training datafile containing multidimensional data. In one form or another,
3430             decision trees have been around for about fifty years. From a statistical
3431             perspective, they are closely related to classification and regression by recursive
3432             partitioning of multidimensional data. Early work that demonstrated the usefulness
3433             of such partitioning of data for classification and regression can be traced to the
3434             work of Terry Therneau in the early 1980's in the statistics community, and to the
3435             work of Ross Quinlan in the mid 1990's in the machine learning community.
3436              
3437             For those not familiar with decision tree ideas, the traditional way to classify
3438             multidimensional data is to start with a feature space whose dimensionality is the
3439             same as that of the data. Each feature in this space corresponds to the attribute
3440             that each dimension of the data measures. You then use the training data to carve up
3441             the feature space into different regions, each corresponding to a different class.
3442             Subsequently, when you try to classify a new data sample, you locate it in the
3443             feature space and find the class label of the region to which it belongs. One can
3444             also give the new data point the same class label as that of the nearest training
3445             sample. This is referred to as the nearest neighbor classification. There exist
3446             hundreds of variations of varying power on these two basic approaches to the
3447             classification of multidimensional data.
3448              
3449             A decision tree classifier works differently. When you construct a decision tree,
3450             you select for the root node a feature test that partitions the training data in a
3451             way that causes maximal disambiguation of the class labels associated with the data.
3452             In terms of information content as measured by entropy, such a feature test would
3453             cause maximum reduction in class entropy in going from all of the training data taken
3454             together to the data as partitioned by the feature test. You then drop from the root
3455             node a set of child nodes, one for each partition of the training data created by the
3456             feature test at the root node. When your features are purely symbolic, you'll have
3457             one child node for each value of the feature chosen for the feature test at the root.
3458             When the test at the root involves a numeric feature, you find the decision threshold
3459             for the feature that best bipartitions the data and you drop from the root node two
3460             child nodes, one for each partition. Now at each child node you pose the same
3461             question that you posed when you found the best feature to use at the root: Which
3462             feature at the child node in question would maximally disambiguate the class labels
3463             associated with the training data corresponding to that child node?
3464              
3465             As the reader would expect, the two key steps in any approach to decision-tree based
3466             classification are the construction of the decision tree itself from a file
3467             containing the training data, and then using the decision tree thus obtained for
3468             classifying new data.
3469              
3470             What is cool about decision tree classification is that it gives you soft
3471             classification, meaning it may associate more than one class label with a given data
3472             vector. When this happens, it may mean that your classes are indeed overlapping in
3473             the underlying feature space. It could also mean that you simply have not supplied
3474             sufficient training data to the decision tree classifier. For a tutorial
3475             introduction to how a decision tree is constructed and used, visit
3476             L
3477              
3478             This module also allows you to generate your own synthetic training and test
3479             data. Generating your own training data, using it for constructing a decision-tree
3480             classifier, and subsequently testing the classifier on a synthetically generated
3481             test set of data is a good way to develop greater proficiency with decision trees.
3482              
3483              
3484             =head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3485              
3486             If you are new to the concept of a decision tree, their practical utility is best
3487             understood with an example that only involves symbolic features. However, as
3488             mentioned earlier, versions of the module higher than 2.0 allow you to use both
3489             symbolic and numeric features.
3490              
3491             Consider the following scenario: Let's say you are running a small investment company
3492             that employs a team of stockbrokers who make buy/sell decisions for the customers of
3493             your company. Assume that your company has asked the traders to make each investment
3494             decision on the basis of the following four criteria:
3495              
3496             price_to_earnings_ratio (P_to_E)
3497              
3498             price_to_sales_ratio (P_to_S)
3499              
3500             return_on_equity (R_on_E)
3501              
3502             market_share (MS)
3503              
3504             Since you are the boss, you keep track of the buy/sell decisions made by the
3505             individual traders. But one unfortunate day, all of your traders decide to quit
3506             because you did not pay them enough. So what do you do? If you had a module like
3507             the one here, you could still run your company and do so in such a way that, on the
3508             average, would do better than any of the individual traders who worked for your
3509             company. This is what you do: You pool together the individual trader buy/sell
3510             decisions you have accumulated during the last one year. This pooled information is
3511             likely to look like:
3512              
3513              
3514             example buy/sell P_to_E P_to_S R_on_E MS
3515             ==============================================================
3516              
3517             example_1 buy high low medium low
3518             example_2 buy medium medium low low
3519             example_3 sell low medium low high
3520             ....
3521             ....
3522              
3523             This data, when formatted according to CSV, would constitute your training file. You
3524             could feed this file into the module by calling:
3525              
3526             my $dt = Algorithm::DecisionTree->new(
3527             training_datafile => $training_datafile,
3528             csv_class_column_index => 1,
3529             csv_columns_for_features => [2,3,4,5],
3530             );
3531             $dt->get_training_data();
3532             $dt->calculate_first_order_probabilities();
3533             $dt->calculate_class_priors();
3534              
3535             Subsequently, you would construct a decision tree by calling:
3536              
3537             my $root_node = $dt->construct_decision_tree_classifier();
3538              
3539             Now you and your company (with practically no employees) are ready to service the
3540             customers again. Suppose your computer needs to make a buy/sell decision about an
3541             investment prospect that is best described by:
3542              
3543             price_to_earnings_ratio = low
3544             price_to_sales_ratio = very_low
3545             return_on_equity = none
3546             market_share = medium
3547              
3548             All that your computer would need to do would be to construct a data vector like
3549              
3550             my @data = qw / P_to_E=low
3551             P_to_S=very_low
3552             R_on_E=none
3553             MS=medium /;
3554              
3555             and call the decision tree classifier you just constructed by
3556              
3557             $dt->classify($root_node, \@data);
3558              
3559             The answer returned will be 'buy' and 'sell', along with the associated
3560             probabilities. So if the probability of 'buy' is considerably greater than the
3561             probability of 'sell', that's what you should instruct your computer to do.
3562              
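             Purely as an illustration of how the returned hash might be used (and assuming,
             as in the SYNOPSIS, that the classification probabilities are keyed by class
             name, with the solution path stored under an additional key such as
             'solution_path'), the buy/sell decision could be automated along these lines:

                 my $answer = $dt->classify($root_node, \@data);
                 my %probs  = %$answer;
                 delete $probs{'solution_path'};                      # assumed key name
                 my ($decision) = sort {$probs{$b} <=> $probs{$a}} keys %probs;
                 print "Recommended action: $decision\n";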
3563             The chances are that, on the average, this approach would beat the performance of any
3564             of your individual traders who worked for you previously since the buy/sell decisions
3565             made by the computer would be based on the collective wisdom of all your previous
3566             traders. The real power of decision trees is, of course, not fully
3567             captured by the silly little example here. However, it does nicely convey the
3568             sense in which the current module could be used.
3569              
3570             =head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3571              
3572             A feature is symbolic when its values are compared using string comparison operators.
3573             By the same token, a feature is numeric when its values are compared using numeric
3574             comparison operators. Having said that, features that take only a small number of
3575             numeric values in the training data can be treated symbolically provided you are
3576             careful about handling their values in the test data. At the least, you have to set
3577             the test data value for such a feature to its closest value in the training data.
3578             The module does that automatically for you for those numeric features for which the
3579             number of different numeric values is less than a user-specified threshold. For those
3580             numeric features that the module is allowed to treat symbolically, this snapping of
3581             the values of the features in the test data to the small set of values in the training
3582             data is carried out automatically by the module. That is, after a user has told the
3583             module which numeric features to treat symbolically, the user need not worry about
3584             how the feature values appear in the test data.
3585              
3586             The constructor parameter C<symbolic_to_numeric_cardinality_threshold> lets you tell
3587             the module when to consider an otherwise numeric feature symbolically. Suppose you
3588             set this parameter to 10; that means that all numeric-looking features that take 10
3589             or fewer different values in the training datafile will be considered to be symbolic
3590             features by the module. See the tutorial at
3591             L for
3592             further information on the implementation issues related to the symbolic and numeric
3593             features.
3594              
3595             =head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3596              
3597             For the purpose of estimating the probabilities, it is necessary to sample the range
3598             of values taken on by a numerical feature. For features with "nice" statistical
3599             properties, this sampling interval is set to the median of the differences between
3600             the successive feature values in the training data. (Obviously, as you would expect,
3601             you first sort all the values for a feature before computing the successive
3602             differences.) This logic will not work for the sort of a feature described below.
3603              
3604             Consider a feature whose values are heavy-tailed, and, at the same time, the values
3605             span a million to one range. What I mean by heavy-tailed is that rare values can
3606             occur with significant probabilities. It could happen that most of the values for
3607             such a feature are clustered at one of the two ends of the range. At the same time,
3608             there may exist a significant number of values near the end of the range that is less
3609             populated. (Typically, features related to human economic activities --- such as
3610             wealth, incomes, etc. --- are of this type.) With the logic described in the
3611             previous paragraph, you could end up with a sampling interval that is much too small,
3612             which could result in millions of sampling points for the feature if you are not
3613             careful.
3614              
3615             Beginning with Version 2.22, you have two options in dealing with such features. You
3616             can choose to go with the default behavior of the module, which is to sample the
3617             value range for such a feature over a maximum of 500 points. Or, you can supply an
3618             additional option to the constructor that sets a user-defined value for the number of
3619             points to use. The name of the option is C<number_of_histogram_bins>. The following
3620             script
3621              
3622             construct_dt_for_heavytailed.pl
3623              
3624             in the C<Examples> directory shows an example of how to call the constructor of the
3625             module with the C<number_of_histogram_bins> option.
3626              
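             In case you would rather see the essential call right here, a minimal sketch
             follows; the training file name, the column indices, and the value of 100 chosen
             for C<number_of_histogram_bins> are all purely illustrative:

                 my $dt = Algorithm::DecisionTree->new(
                              training_datafile         => "my_heavytailed_data.csv",  # hypothetical file
                              csv_class_column_index    => 1,
                              csv_columns_for_features  => [2,3,4],
                              number_of_histogram_bins  => 100,     # illustrative value
                          );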
3627              
3628             =head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3629              
3630             Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
3631             the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
3632             training data to test its ability to discriminate between the classes mentioned in
3633             the training file.
3634              
3635             The 10-fold cross-validation test divides all of the training data into ten parts,
3636             with nine parts used for training a decision tree and one part used for testing its
3637             ability to classify correctly. This selection of nine parts for training and one part
3638             for testing is carried out in all of the ten different possible ways.
3639              
3640             The following code fragment illustrates how you invoke the testing function of the
3641             EvalTrainingData class:
3642              
3643             my $training_datafile = "training.csv";
3644             my $eval_data = EvalTrainingData->new(
3645             training_datafile => $training_datafile,
3646             csv_class_column_index => 1,
3647             csv_columns_for_features => [2,3],
3648             entropy_threshold => 0.01,
3649             max_depth_desired => 3,
3650             symbolic_to_numeric_cardinality_threshold => 10,
3651             csv_cleanup_needed => 1,
3652             );
3653             $eval_data->get_training_data();
3654             $eval_data->evaluate_training_data()
3655              
3656             The last statement above prints out a Confusion Matrix and the value of Training Data
3657             Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3658             The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3659             cross-validation test.
3660              
3661             This testing functionality can also be used to find the best values to use for the
3662             constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
3663             C<symbolic_to_numeric_cardinality_threshold>.
3664              
3665             The following two scripts in the C<Examples> directory illustrate the use of the
3666             C<EvalTrainingData> class for testing the quality of your data:
3667              
3668             evaluate_training_data1.pl
3669             evaluate_training_data2.pl
3670              
3671              
3672             =head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3673              
3674             Assuming your training data is good, the quality of the results you get from a
3675             decision tree would depend on the choices you make for the constructor parameters
3676             C<entropy_threshold>, C<max_depth_desired>, and
3677             C<symbolic_to_numeric_cardinality_threshold>. You can optimize your choices for
3678             these parameters by running the 10-fold cross-validation test that is made available
3679             in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
3680             in the module file. A description of how to run this test is in the previous section
3681             of this document.
3682              
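             If you want to automate that search, a rough sketch of a parameter sweep is shown
             below. The grid values are purely illustrative, and the Training Data Quality
             Index for each combination is printed by C<evaluate_training_data()> rather than
             captured programmatically here:

                 foreach my $entropy_threshold (0.01, 0.1) {
                     foreach my $max_depth_desired (3, 5, 8) {
                         print "\n=== entropy_threshold=$entropy_threshold  " .
                               "max_depth_desired=$max_depth_desired ===\n";
                         my $eval_data = EvalTrainingData->new(
                                             training_datafile        => "training.csv",
                                             csv_class_column_index   => 1,
                                             csv_columns_for_features => [2,3],
                                             entropy_threshold        => $entropy_threshold,
                                             max_depth_desired        => $max_depth_desired,
                                             symbolic_to_numeric_cardinality_threshold => 10,
                                             csv_cleanup_needed       => 1,
                                         );
                         $eval_data->get_training_data();
                         $eval_data->evaluate_training_data();
                     }
                 }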
3683              
3684             =head1 DECISION TREE INTROSPECTION
3685              
3686             Starting with Version 2.30, you can ask the C<DTIntrospection> class of the module to
3687             explain the classification decisions made at the different nodes of the decision
3688             tree.
3689              
3690             Perhaps the most important bit of information you are likely to seek through DT
3691             introspection is the list of the training samples that fall directly in the portion
3692             of the feature space that is assigned to a node.
3693              
3694             However, note that, when training samples are non-uniformly distributed in the
3695             underlying feature space, it is possible for a node to exist even when there are no
3696             training samples in the portion of the feature space assigned to the node. That is
3697             because the decision tree is constructed from the probability densities estimated
3698             from the training data. When the training samples are non-uniformly distributed, it
3699             is entirely possible for the estimated probability densities to be non-zero in a
3700             small region around a point even when there are no training samples specifically in
3701             that region. (After you have created a statistical model for, say, the height
3702             distribution of people in a community, the model may return a non-zero probability
3703             for the height values in a small interval even if the community does not include a
3704             single individual whose height falls in that interval.)
3705              
3706             That a decision-tree node can exist even when there are no training samples in that
3707             portion of the feature space that belongs to the node is an important indication of
3708             the generalization ability of a decision-tree-based classifier.
3709              
3710             In light of the explanation provided above, before the DTIntrospection class supplies
3711             any answers at all, it asks you to accept the fact that features can take on non-zero
3712             probabilities at a point in the feature space even though there are zero training
3713             samples at that point (or in a small region around that point). If you do not accept
3714             this rudimentary fact, the introspection class will not yield any answers (since you
3715             are not going to believe the answers anyway).
3716              
3717             The point made above implies that the path leading to a node in the decision tree may
3718             test a feature for a certain value or threshold despite the fact that the portion of
3719             the feature space assigned to that node is devoid of any training data.
3720              
3721             See the following three scripts in the Examples directory for how to carry out DT
3722             introspection:
3723              
3724             introspection_in_a_loop_interactive.pl
3725              
3726             introspection_show_training_samples_at_all_nodes_direct_influence.pl
3727              
3728             introspection_show_training_samples_to_nodes_influence_propagation.pl
3729              
3730             The first script places you in an interactive session in which you will first be
3731             asked for the node number you are interested in. Subsequently, you will be asked for
3732             whether or not you are interested in specific questions that the introspection can
3733             provide answers for. The second script descends down the decision tree and shows for
3734             each node the training samples that fall directly in the portion of the feature space
3735             assigned to that node. The third script shows for each training sample how it
3736             affects the decision-tree nodes either directly or indirectly through the
3737             generalization achieved by the probabilistic modeling of the data.
3738              
3739             The output of the script
3740             C<introspection_show_training_samples_at_all_nodes_direct_influence.pl> looks like:
3741              
3742             Node 0: the samples are: None
3743             Node 1: the samples are: [sample_46 sample_58]
3744             Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3745             Node 3: the samples are: []
3746             Node 4: the samples are: []
3747             ...
3748             ...
3749              
3750             The nodes for which no samples are listed come into existence through
3751             the generalization achieved by the probabilistic modeling of the data.
3752              
3753             The output produced by the script
3754             C<introspection_show_training_samples_to_nodes_influence_propagation.pl> looks like
3755              
3756             sample_1:
3757             nodes affected directly: [2 5 19 23]
3758             nodes affected through probabilistic generalization:
3759             2=> [3 4 25]
3760             25=> [26]
3761             5=> [6]
3762             6=> [7 13]
3763             7=> [8 11]
3764             8=> [9 10]
3765             11=> [12]
3766             13=> [14 18]
3767             14=> [15 16]
3768             16=> [17]
3769             19=> [20]
3770             20=> [21 22]
3771             23=> [24]
3772            
3773             sample_4:
3774             nodes affected directly: [2 5 6 7 11]
3775             nodes affected through probabilistic generalization:
3776             2=> [3 4 25]
3777             25=> [26]
3778             5=> [19]
3779             19=> [20 23]
3780             20=> [21 22]
3781             23=> [24]
3782             6=> [13]
3783             13=> [14 18]
3784             14=> [15 16]
3785             16=> [17]
3786             7=> [8]
3787             8=> [9 10]
3788             11=> [12]
3789            
3790             ...
3791             ...
3792             ...
3793              
3794             For each training sample, the display shown above first presents the list of nodes
3795             that are directly affected by the sample. A node is affected directly by a sample if
3796             the latter falls in the portion of the feature space that belongs to the former.
3797             Subsequently, for each training sample, the display shows a subtree of the nodes that
3798             are affected indirectly by the sample through the generalization achieved by the
3799             probabilistic modeling of the data. In general, a node is affected indirectly by a
3800             sample if it is a descendant of another node that is affected directly.
3801              
3802             Also see the section titled B<THE INTROSPECTION API> regarding how to invoke the
3803             introspection capabilities of the module in your own code.
3804              
3805             =head1 METHODS
3806              
3807             The module provides the following methods for constructing a decision tree from
3808             training data in a disk file and for classifying new data records with the decision
3809             tree thus constructed:
3810              
3811             =over 4
3812              
3813             =item B<new():>
3814              
3815             my $dt = Algorithm::DecisionTree->new(
3816             training_datafile => $training_datafile,
3817             csv_class_column_index => 2,
3818             csv_columns_for_features => [3,4,5,6,7,8],
3819             entropy_threshold => 0.01,
3820             max_depth_desired => 8,
3821             symbolic_to_numeric_cardinality_threshold => 10,
3822             csv_cleanup_needed => 1,
3823             );
3824              
3825             A call to C<new()> constructs a new instance of the C<Algorithm::DecisionTree> class.
3826             For this call to make sense, the training data in the training datafile must be
3827             in the CSV format.
3828              
3829             =back
3830              
3831             =head2 The Constructor Parameters
3832              
3833             =over 8
3834              
3835             =item C<training_datafile>:
3836              
3837             This parameter supplies the name of the file that contains the training data.
3838              
3839             =item C<csv_class_column_index>:
3840              
3841             When using a CSV file for your training data, this parameter supplies the zero-based
3842             column index for the column that contains the class label for each data record in the
3843             training file.
3844              
3845              
3846             =item C<csv_cleanup_needed>:
3847              
3848             You need to set this parameter to 1 if your CSV file has double quoted strings (which
3849             may include commas) as values for the fields and if such values are allowed to
3850             include commas for, presumably, better readability.
3851              
3852             =item C<csv_columns_for_features>:
3853              
3854             When using a CSV file for your training data, this parameter supplies a list of
3855             columns corresponding to the features you wish to use for decision tree construction.
3856             Each column is specified by its zero-based index.
3857              
3858             =item C<entropy_threshold>:
3859              
3860             This parameter sets the granularity with which the entropies are sampled by the
3861             module. For example, a feature test at a node in the decision tree is acceptable if
3862             the entropy gain achieved by the test exceeds this threshold. The larger the value
3863             you choose for this parameter, the smaller the tree. Its default value is 0.01.
3864              
3865             =item C<max_depth_desired>:
3866              
3867             This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3868             smaller the value you choose for this parameter, the smaller the tree.
3869              
3870             =item C<symbolic_to_numeric_cardinality_threshold>:
3871              
3872             This parameter allows the module to treat an otherwise numeric feature symbolically
3873             if the number of different values the feature takes in the training data file does
3874             not exceed the value of this parameter.
3875              
3876             =item C<number_of_histogram_bins>:
3877              
3878             This parameter gives the user the option to set the number of points at which the
3879             value range for a feature should be sampled for estimating the probabilities. This
3880             parameter is effective only for those features that occupy a large value range and
3881             whose probability distributions are heavy tailed. B<This parameter is also important
3882             when you have a very large training dataset:> In general, the larger the dataset, the
3883             smaller the smallest difference between any two values for a numeric feature in
3884             relation to the overall range of values for that feature. In such cases, the module
3885             may use too large a number of bins for estimating the probabilities and that may slow
3886             down the calculation of the decision tree. You can get around this difficulty by
3887             explicitly giving a value to the C<number_of_histogram_bins> parameter.
3888              
3889             =back
3890              
3891              
3892             You can choose the best values to use for the last three constructor parameters by
3893             running a 10-fold cross-validation test on your training data through the class
3894             C<EvalTrainingData> that comes with Versions 2.1 and higher of this module. See the
3895             section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
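
The following is a minimal sketch of how such a cross-validation run can be set up.
The file name and column choices are illustrative, and the C<evaluate_training_data()>
call is the one used by the C<evaluate_training_data1.pl> and C<evaluate_training_data2.pl>
scripts in the C<Examples> directory:

    use Algorithm::DecisionTree;

    # EvalTrainingData is defined inside the Algorithm::DecisionTree module file.
    my $evaluator = EvalTrainingData->new(
                        training_datafile        => 'stage3cancer.csv',
                        csv_class_column_index   => 2,
                        csv_columns_for_features => [3,4,5,6,7,8],
                        entropy_threshold        => 0.01,
                        max_depth_desired        => 8,
                        symbolic_to_numeric_cardinality_threshold => 10,
                        csv_cleanup_needed       => 1,
                    );
    $evaluator->get_training_data();
    $evaluator->evaluate_training_data();   # runs the 10-fold cross-validation test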
3896              
3897             =over
3898              
3899             =item B<get_training_data():>
3900              
3901             After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
3902             you must now read in the training data that is in the file named in the call to the
3903             constructor. This you do by:
3904              
3905             $dt->get_training_data();
3906              
3907              
3908             =item B<show_training_data():>
3909              
3910             If you wish to see the training data that was just digested by the module,
3911             call
3912              
3913             $dt->show_training_data();
3914              
3915             =item B<calculate_first_order_probabilities():>
3916              
3917             =item B<calculate_class_priors():>
3918              
3919             After the module has read the training data file, it needs to initialize the
3920             probability cache. This you do by invoking:
3921              
3922             $dt->calculate_first_order_probabilities()
3923             $dt->calculate_class_priors()
3924              
3925             =item B<construct_decision_tree_classifier():>
3926              
3927             With the probability cache initialized, it is time to construct a decision tree
3928             classifier. This you do by
3929              
3930             my $root_node = $dt->construct_decision_tree_classifier();
3931              
3932             This call returns an instance of type C<DTNode>. The C<DTNode> class is defined
3933             within the main package file. So, don't forget that C<$root_node> in the above
3934             example call will be instantiated to an object of type C<DTNode>.
3935              
3936             =item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3937              
3938             $root_node->display_decision_tree(" ");
3939              
3940             This will display the decision tree in your terminal window by using a recursively
3941             determined offset for each node as the display routine descends down the tree.
3942              
3943             I have intentionally left the syntax fragment C<$root_node> in the above call to
3944             remind the reader that C<display_decision_tree()> is NOT called on the instance of
3945             the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
3946             the call to C<construct_decision_tree_classifier()>.
3947              
3948             =item B<classify($root_node, \@test_sample):>
3949              
3950             Let's say you want to classify the following data record:
3951              
3952             my @test_sample = qw / g2=4.2
3953             grade=2.3
3954             gleason=4
3955             eet=1.7
3956             age=55.0
3957             ploidy=diploid /;
3958              
3959             you'd make the following call:
3960              
3961             my $classification = $dt->classify($root_node, \@test_sample);
3962              
3963             where, again, C<$root_node> is an instance of type C<DTNode> returned by the call to
3964             C<construct_decision_tree_classifier()>. The variable C<$classification> holds a
3965             reference to a hash whose keys are the class names and whose values are the associated
3966             probabilities. The hash that is returned by the above call also includes a special
3967             key-value pair for a key named C<solution_path>. The value associated with this key
3968             is an anonymous array that holds the path, in the form of a list of nodes, from the
3969             root node to the leaf node in the decision tree where the final classification was
3970             made.
3971              
3972              
3973             =item B<classify_by_asking_questions($root_node):>
3974              
3975             This method allows you to use a decision-tree based classifier in an interactive
3976             mode. In this mode, a user is prompted for answers to the questions pertaining to
3977             the feature tests at the nodes of the tree. The syntax for invoking this method is:
3978              
3979             my $classification = $dt->classify_by_asking_questions($root_node);
3980              
3981             where C<$dt> is an instance of the C<Algorithm::DecisionTree> class returned by a
3982             call to C<new()> and C<$root_node> is the root node of the decision tree returned by a
3983             call to C<construct_decision_tree_classifier()>.
3984              
3985             =back
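
Tying the above methods together, here is a minimal end-to-end sketch that constructs
a decision tree from a CSV training file and classifies one test sample. The file
name and the feature values are illustrative; the sequence of calls is the one
described in the items above:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;

    my $dt = Algorithm::DecisionTree->new(
                 training_datafile        => 'stage3cancer.csv',
                 csv_class_column_index   => 2,
                 csv_columns_for_features => [3,4,5,6,7,8],
                 entropy_threshold        => 0.01,
                 max_depth_desired        => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 csv_cleanup_needed       => 1,
             );
    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
    my $root_node = $dt->construct_decision_tree_classifier();

    my @test_sample = qw / g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;
    my $classification = $dt->classify($root_node, \@test_sample);

    # Print the class probabilities; the 'solution_path' entry holds the nodes on
    # the path from the root to the leaf where the classification was made.
    foreach my $class_name (sort keys %$classification) {
        next if $class_name eq 'solution_path';
        print "$class_name => $classification->{$class_name}\n";
    }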
3986              
3987              
3988             =head1 THE INTROSPECTION API
3989              
3990             To construct an instance of C<DTIntrospection>, you call
3991              
3992             my $introspector = DTIntrospection->new($dt);
3993              
3994             where you supply the instance of the C<DecisionTree> class you used for constructing
3995             the decision tree through the parameter C<$dt>. After you have constructed an
3996             instance of the introspection class, you must initialize it by
3997              
3998             $introspector->initialize();
3999              
4000             Subsequently, you can invoke either of the following methods:
4001              
4002             $introspector->explain_classification_at_one_node($node);
4003              
4004             $introspector->explain_classifications_at_multiple_nodes_interactively();
4005              
4006             depending on whether you want introspection at a single specified node or inside an
4007             infinite loop for an arbitrary number of nodes.
4008              
4009             If you want to output a tabular display that shows for each node in the decision tree
4010             all the training samples that fall in the portion of the feature space that belongs
4011             to that node, call
4012              
4013             $introspector->display_training_samples_at_all_nodes_direct_influence_only();
4014              
4015             If you want to output a tabular display that shows for each training sample a list of
4016             all the nodes that are affected directly AND indirectly by that sample, call
4017              
4018             $introspector->display_training_training_samples_to_nodes_influence_propagation();
4019              
4020             A training sample affects a node directly if the sample falls in the portion of the
4021             feature space assigned to that node. On the other hand, a training sample is
4022             considered to affect a node indirectly if the node is a descendant of a node that is
4023             affected directly by the sample.
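
As a quick summary, the introspection calls described above can be strung together as
in the following sketch, where C<$dt> is assumed to be the C<Algorithm::DecisionTree>
instance that was used to construct the decision tree:

    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();

    my $node = 1;   # the node whose classification decision you want explained
    $introspector->explain_classification_at_one_node($node);
    $introspector->explain_classifications_at_multiple_nodes_interactively();

    # Tabular displays of direct and propagated sample-to-node influence:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();
    $introspector->display_training_training_samples_to_nodes_influence_propagation();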
4024              
4025              
4026             =head1 BULK CLASSIFICATION OF DATA RECORDS
4027              
4028             For large test datasets, you would obviously want to process an entire file of test
4029             data at a time. The following script in the C<Examples> directory illustrates how
4030             you can do that:
4031              
4032             classify_test_data_in_a_file.pl
4033              
4034             This script requires three command-line arguments: the first argument names the
4035             training datafile, the second the test datafile, and the third the file in which the
4036             classification results are to be deposited.
4037              
4038             The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
4039             C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
4040             bulk classification of data records when you wish to take advantage of bagging,
4041             boosting, or tree randomization. In their respective directories, these scripts are
4042             named:
4043              
4044             bagging_for_bulk_classification.pl
4045             boosting_for_bulk_classification.pl
4046             classify_database_records.pl
4047              
4048              
4049             =head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4050              
4051             It depends on whether you apply the classifier at once to all the data samples in a
4052             file, or whether you feed one data sample at a time into the classifier.
4053              
4054             In general, the classifier returns soft classification for a test data vector. What
4055             that means is that, in general, the classifier will list all the classes to which a
4056             given data vector could belong and the probability of each such class label for the
4057             data vector. Run the examples scripts in the Examples directory to see how the output
4058             of classification can be displayed.
4059              
4060             With regard to the soft classifications returned by this classifier, if the
4061             probability distributions for the different classes overlap in the underlying feature
4062             space, you would want the classifier to return all of the applicable class labels for
4063             a data vector along with the corresponding class probabilities. Another reason for
4064             why the decision tree classifier may associate significant probabilities with
4065             multiple class labels is that you used an inadequate number of training samples to
4066             induce the decision tree. The good thing is that the classifier does not lie to you
4067             (unlike, say, a hard classification rule that would return a single class label
4068             corresponding to the partitioning of the underlying feature space). The decision
4069             tree classifier gives you the best classification that can be made given the training
4070             data you fed into it.
4071              
4072              
4073             =head1 USING BAGGING
4074              
4075             Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
4076             comes with the module to incorporate bagging in your decision tree based
4077             classification. Bagging means constructing multiple decision trees for different
4078             (possibly overlapping) segments of the data extracted from your training dataset and
4079             then aggregating the decisions made by the individual decision trees for the final
4080             classification. The aggregation of the classification decisions can average out the
4081             noise and bias that may otherwise affect the classification decision obtained from
4082             just one tree.
4083              
4084             =over 4
4085              
4086             =item B<new():>
4087              
4088             A typical call to the constructor for the C<Algorithm::DecisionTreeWithBagging> class looks
4089             like:
4090              
4091             use Algorithm::DecisionTreeWithBagging;
4092            
4093             my $training_datafile = "stage3cancer.csv";
4094            
4095             my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4096             training_datafile => $training_datafile,
4097             csv_class_column_index => 2,
4098             csv_columns_for_features => [3,4,5,6,7,8],
4099             entropy_threshold => 0.01,
4100             max_depth_desired => 8,
4101             symbolic_to_numeric_cardinality_threshold => 10,
4102             how_many_bags => 4,
4103             bag_overlap_fraction => 0.2,
4104             csv_cleanup_needed => 1,
4105             );
4106            
4107             Note in particular the following two constructor parameters:
4108            
4109             how_many_bags
4110              
4111             bag_overlap_fraction
4112              
4113             where, as the name implies, the parameter C<how_many_bags> controls how many bags
4114             (and, therefore, how many decision trees) will be constructed from your training
4115             dataset; and where the parameter C<bag_overlap_fraction> controls the degree of
4116             overlap between the bags. To understand what exactly is achieved by setting the
4117             parameter C<bag_overlap_fraction> to 0.2 in the above example, let's say that the
4118             non-overlapping partitioning of the training data between the bags results in 100
4119             training samples per bag. With C<bag_overlap_fraction> set to 0.2, an additional 20
4120             samples drawn randomly from the other bags will be added to the data in each bag.
4121              
4122             =back
4123              
4124             =head2 B<Methods defined for C<DecisionTreeWithBagging> class>
4125              
4126             =over 8
4127              
4128             =item B<get_training_data_for_bagging():>
4129              
4130             This method reads your training datafile, randomizes it, and then partitions it into
4131             the specified number of bags. Subsequently, if the constructor parameter
4132             C<bag_overlap_fraction> is non-zero, it adds to each bag additional samples drawn at
4133             random from the other bags. The number of these additional samples added to each bag
4134             is controlled by the constructor parameter C<bag_overlap_fraction>. If this
4135             parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4136             drawn from the other bags.
4137              
4138             =item B<show_training_data_in_bags():>
4139              
4140             Shows for each bag the names of the training data samples in that bag.
4141              
4142             =item B<calculate_first_order_probabilities():>
4143              
4144             Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
4145             first-order probabilities from the data samples in each bag.
4146              
4147             =item B<calculate_class_priors():>
4148              
4149             Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4150             class priors for the data samples in each bag.
4151              
4152             =item B<construct_decision_trees_for_bags():>
4153              
4154             Calls on the appropriate method of the main C<DecisionTree> class to construct a
4155             decision tree from the training data in each bag.
4156              
4157             =item B<display_decision_trees_for_bags():>
4158              
4159             Displays separately the decision tree constructed for each bag.
4160              
4161             =item B<classify_with_bagging( $test_sample ):>
4162              
4163             Calls on the appropriate methods of the main C<DecisionTree> class to classify the
4164             argument test sample.
4165              
4166             =item B<display_classification_results_for_each_bag():>
4167              
4168             Displays separately the classification decision made by the decision tree
4169             constructed for each bag.
4170              
4171             =item B<get_majority_vote_classification():>
4172              
4173             Using majority voting, this method aggregates the classification decisions made by
4174             the individual decision trees into a single decision.
4175              
4176             =back
4177              
4178             See the example scripts in the directory C<ExamplesBagging> for how to call these
4179             methods for classifying individual samples and for bulk classification when you place
4180             all your test samples in a single file.
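
For reference, here is a minimal sketch of a single-sample bagging session that
strings together the calls described above. The training file and the test sample
are illustrative; check C<bagging_for_classifying_one_test_sample.pl> in the
C<ExamplesBagging> directory for the exact calling conventions:

    use Algorithm::DecisionTreeWithBagging;

    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                    training_datafile        => 'stage3cancer.csv',
                    csv_class_column_index   => 2,
                    csv_columns_for_features => [3,4,5,6,7,8],
                    entropy_threshold        => 0.01,
                    max_depth_desired        => 8,
                    symbolic_to_numeric_cardinality_threshold => 10,
                    how_many_bags            => 4,
                    bag_overlap_fraction     => 0.2,
                    csv_cleanup_needed       => 1,
                );
    $dtbag->get_training_data_for_bagging();
    $dtbag->calculate_first_order_probabilities();
    $dtbag->calculate_class_priors();
    $dtbag->construct_decision_trees_for_bags();

    my @test_sample = qw / g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;
    $dtbag->classify_with_bagging(\@test_sample);
    $dtbag->display_classification_results_for_each_bag();
    $dtbag->get_majority_vote_classification();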
4181              
4182             =head1 USING BOOSTING
4183              
4184             Starting with Version 3.20, you can use the class C<BoostedDecisionTree> for
4185             constructing a boosted decision-tree classifier. Boosting results in a cascade of
4186             decision trees in which each decision tree is constructed with samples that are
4187             mostly those that are misclassified by the previous decision tree. To be precise,
4188             you create a probability distribution over the training samples for the selection of
4189             samples for training each decision tree in the cascade. To start out, the
4190             distribution is uniform over all of the samples. Subsequently, this probability
4191             distribution changes according to the misclassifications by each tree in the cascade:
4192             if a sample is misclassified by a given tree in the cascade, the probability of its
4193             being selected for training the next tree is increased significantly. You also
4194             associate a trust factor with each decision tree depending on its power to classify
4195             correctly all of the training data samples. After a cascade of decision trees is
4196             constructed in this manner, you construct a final classifier that calculates the
4197             class label for a test data sample by taking into account the classification
4198             decisions made by each individual tree in the cascade, the decisions being weighted
4199             by the trust factors associated with the individual classifiers. These boosting
4200             notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4201             celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4202             Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4203             the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4204             tutorial introduction to AdaBoost, see L
4205              
4206             Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4207             boosting apply only to the case of binary classification. Additionally, your
4208             training dataset must capture all of the significant statistical variations in the
4209             classes represented therein.
4210              
4211             =over 4
4212              
4213             =item B<new():>
4214              
4215             If you'd like to experiment with boosting, a typical call to the constructor for the
4216             C<Algorithm::BoostedDecisionTree> class looks like:
4217              
4218             use Algorithm::BoostedDecisionTree;
4219             my $training_datafile = "training6.csv";
4220             my $boosted = Algorithm::BoostedDecisionTree->new(
4221             training_datafile => $training_datafile,
4222             csv_class_column_index => 1,
4223             csv_columns_for_features => [2,3],
4224             entropy_threshold => 0.01,
4225             max_depth_desired => 8,
4226             symbolic_to_numeric_cardinality_threshold => 10,
4227             how_many_stages => 4,
4228             csv_cleanup_needed => 1,
4229             );
4230              
4231             Note in particular the constructor parameter:
4232            
4233             how_many_stages
4234              
4235             As its name implies, this parameter controls how many stages will be used in the
4236             boosted decision tree classifier. As mentioned above, a separate decision tree is
4237             constructed for each stage of boosting using a set of training samples that are drawn
4238             through a probability distribution maintained over the entire training dataset.
4239              
4240             =back
4241              
4242             =head2 B<Methods defined for C<BoostedDecisionTree> class>
4243              
4244             =over 8
4245              
4246             =item B<get_training_data_for_base_tree():>
4247              
4248             This method reads your training datafile and creates the data structures from the data
4249             ingested for constructing the base decision tree.
4250              
4251             =item B<show_training_data_for_base_tree():>
4252              
4253             Writes to the standard output the training data samples and also some relevant
4254             properties of the features used in the training dataset.
4255              
4256             =item B<calculate_first_order_probabilities_and_class_priors():>
4257              
4258             Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
4259             first-order probabilities and the class priors.
4260              
4261             =item B<construct_base_decision_tree():>
4262              
4263             Calls on the appropriate method of the main C<DecisionTree> class to construct the
4264             base decision tree.
4265              
4266             =item B<display_base_decision_tree():>
4267              
4268             Displays the base decision tree in your terminal window. (The textual form of the
4269             decision tree is written out to the standard output.)
4270              
4271             =item B<construct_cascade_of_trees():>
4272              
4273             Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4274             earlier, the training samples for each tree in the cascade are drawn using a
4275             probability distribution over the entire training dataset. This probability
4276             distribution for any given tree in the cascade is heavily influenced by which
4277             training samples are misclassified by the previous tree.
4278              
4279             =item B<display_decision_trees_for_different_stages():>
4280              
4281             Displays separately in your terminal window the decision tree constructed for each
4282             stage of the cascade. (The textual form of the trees is written out to the standard
4283             output.)
4284              
4285             =item B<classify_with_boosting( $test_sample ):>
4286              
4287             Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4288              
4289             =item B<display_classifications_by_each_stage():>
4290              
4291             You can call this method to display in your terminal window the classification
4292             decision made by each decision tree in the cascade. The method also prints out the
4293             trust factor associated with each decision tree. It is important to look
4294             simultaneously at the classification decision and the trust factor for each tree ---
4295             since a classification decision made by a specific tree may appear bizarre for a
4296             given test sample. This method is useful primarily for debugging purposes.
4297              
4298             =item B<show_class_labels_for_misclassified_samples_in_stage( $stage_index ):>
4299              
4300             As with the previous method, this method is useful mostly for debugging. It returns
4301             class labels for the samples misclassified by the stage whose integer index is
4302             supplied as an argument to the method. Say you have 10 stages in your cascade. The
4303             value of the argument C<$stage_index> would go from 0 to 9, with 0 corresponding to
4304             the base tree.
4305              
4306             =item B<trust_weighted_majority_vote_classifier():>
4307              
4308             Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4309             classification decisions made by the individual trees while taking into account the
4310             trust factors associated with the trees. As mentioned earlier, we associate with
4311             each tree of the cascade a trust factor that depends on the overall misclassification
4312             rate associated with that tree.
4313              
4314             =back
4315              
4316             See the example scripts in the C<ExamplesBoosting> subdirectory for how to call the
4317             methods listed above for classifying individual data samples with boosting and for
4318             bulk classification when you place all your test samples in a single file.
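
For reference, here is a minimal sketch of a single-sample boosting session based on
the method descriptions above. The training file and the feature names in the test
sample are illustrative; check the scripts in the C<ExamplesBoosting> subdirectory
for the exact calling conventions and for how the final classification is retrieved:

    use Algorithm::BoostedDecisionTree;

    my $boosted = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => 'training6.csv',
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2,3],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      symbolic_to_numeric_cardinality_threshold => 10,
                      how_many_stages          => 4,
                      csv_cleanup_needed       => 1,
                  );
    $boosted->get_training_data_for_base_tree();
    $boosted->calculate_first_order_probabilities_and_class_priors();
    $boosted->construct_base_decision_tree();
    $boosted->construct_cascade_of_trees();

    my @test_sample = qw / feature1=3.5 feature2=6.1 /;   # illustrative feature names
    $boosted->classify_with_boosting(\@test_sample);
    $boosted->display_classifications_by_each_stage();
    $boosted->trust_weighted_majority_vote_classifier();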
4319              
4320              
4321             =head1 USING RANDOMIZED DECISION TREES
4322              
4323             As mentioned earlier, the new C<RandomizedTreesForBigData> class allows you to solve
4324             the following two problems: (1) Data classification using the needle-in-a-haystack
4325             metaphor, that is, when a vast majority of your training samples belong to just one
4326             class. And (2) You have access to a very large database of training samples and you
4327             wish to construct an ensemble of decision trees for classification.
4328              
4329             =over 4
4330              
4331             =item B<new():>
4332              
4333             Here is how you'd call the C<RandomizedTreesForBigData> constructor for
4334             needle-in-a-haystack classification:
4335              
4336             use Algorithm::RandomizedTreesForBigData;
4337             my $training_datafile = "your_database.csv";
4338             my $rt = Algorithm::RandomizedTreesForBigData->new(
4339             training_datafile => $training_datafile,
4340             csv_class_column_index => 48,
4341             csv_columns_for_features => [24,32,33,34,41],
4342             entropy_threshold => 0.01,
4343             max_depth_desired => 8,
4344             symbolic_to_numeric_cardinality_threshold => 10,
4345             how_many_trees => 5,
4346             looking_for_needles_in_haystack => 1,
4347             csv_cleanup_needed => 1,
4348             );
4349              
4350             Note in particular the constructor parameters:
4351              
4352             looking_for_needles_in_haystack
4353             how_many_trees
4354              
4355             The first of these parameters, C<looking_for_needles_in_haystack>, invokes the logic for
4356             constructing an ensemble of decision trees, each based on a training dataset that
4357             uses all of the minority class samples, and a random drawing from the majority class
4358             samples.
4359              
4360             Here is how you'd call the C<RandomizedTreesForBigData> constructor for a more
4361             general attempt at constructing an ensemble of decision trees, with each tree trained
4362             with randomly drawn samples from a large database of training data (without paying
4363             attention to the differences in the sizes of the populations for the different
4364             classes):
4365              
4366             use Algorithm::RandomizedTreesForBigData;
4367             my $training_datafile = "your_database.csv";
4368             my $rt = Algorithm::RandomizedTreesForBigData->new(
4369             training_datafile => $training_datafile,
4370             csv_class_column_index => 2,
4371             csv_columns_for_features => [3,4,5,6,7,8],
4372             entropy_threshold => 0.01,
4373             max_depth_desired => 8,
4374             symbolic_to_numeric_cardinality_threshold => 10,
4375             how_many_trees => 3,
4376             how_many_training_samples_per_tree => 50,
4377             csv_cleanup_needed => 1,
4378             );
4379              
4380             Note in particular the constructor parameters:
4381              
4382             how_many_training_samples_per_tree
4383             how_many_trees
4384              
4385             When you set the C<how_many_training_samples_per_tree> parameter, you are not allowed
4386             to also set the C<looking_for_needles_in_haystack> parameter, and vice versa.
4387              
4388             =back
4389              
4390             =head2 B<Methods defined for C<RandomizedTreesForBigData> class>
4391              
4392             =over 8
4393              
4394             =item B<get_training_data_for_N_trees():>
4395              
4396             What this method does depends on which of the two constructor parameters ---
4397             C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
4398             set. When the former is set, it creates a collection of training datasets for
4399             C<how_many_trees> number of decision trees, with each dataset being a mixture of the
4400             minority class and samples drawn randomly from the majority class. However, when the
4401             latter option is set, all the datasets are drawn randomly from the training database
4402             with no particular attention given to the relative populations of the two classes.
4403              
4404             =item B<show_training_data_for_all_trees():>
4405              
4406             As the name implies, this method shows the training data being used for all the
4407             decision trees. This method is useful for debugging purposes using small datasets.
4408              
4409             =item B<calculate_first_order_probabilities():>
4410              
4411             Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4412             first-order probabilities for the training dataset to be used for each decision tree.
4413              
4414             =item B<calculate_class_priors():>
4415              
4416             Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4417             class priors for the training dataset to be used for each decision tree.
4418              
4419             =item B<construct_all_decision_trees():>
4420              
4421             Calls on the appropriate method of the main C<DecisionTree> class to construct the
4422             decision trees.
4423              
4424             =item B<display_all_decision_trees():>
4425              
4426             Displays all the decision trees in your terminal window. (The textual form of the
4427             decision trees is written out to the standard output.)
4428              
4429             =item B<classify_with_all_trees( $test_sample ):>
4430              
4431             The test_sample is sent to each decision tree for classification.
4432              
4433             =item B<display_classification_results_for_all_trees():>
4434              
4435             The classification decisions returned by the individual decision trees are written
4436             out to the standard output.
4437              
4438             =item B<get_majority_vote_classification():>
4439              
4440             This method aggregates the classification results returned by the individual decision
4441             trees and returns the majority decision.
4442              
4443             =back
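
For reference, here is a minimal sketch that strings together the calls described
above for the needle-in-a-haystack case. The file name, column indexes, and test
sample are illustrative; check the scripts in the C<ExamplesRandomizedTrees>
directory for the exact calling conventions:

    use Algorithm::RandomizedTreesForBigData;

    my $rt = Algorithm::RandomizedTreesForBigData->new(
                 training_datafile        => 'your_database.csv',
                 csv_class_column_index   => 48,
                 csv_columns_for_features => [24,32,33,34,41],
                 entropy_threshold        => 0.01,
                 max_depth_desired        => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 how_many_trees           => 5,
                 looking_for_needles_in_haystack => 1,
                 csv_cleanup_needed       => 1,
             );
    $rt->get_training_data_for_N_trees();
    $rt->calculate_first_order_probabilities();
    $rt->calculate_class_priors();
    $rt->construct_all_decision_trees();

    my @test_sample = qw / feature1=3.5 feature2=6.1 /;   # illustrative feature names
    $rt->classify_with_all_trees(\@test_sample);
    $rt->display_classification_results_for_all_trees();
    $rt->get_majority_vote_classification();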
4444              
4445             =head1 CONSTRUCTING REGRESSION TREES:
4446              
4447             Decision tree based modeling requires that the class labels be distinct. That is,
4448             the training dataset must contain a relatively small number of discrete class labels
4449             for all of your data records if you want to model the data with one or more decision
4450             trees. However, when one is trying to understand all of the associational
4451             relationships that exist in a large database, one often runs into situations where,
4452             instead of discrete class labels, you have a continuously valued variable as a
4453             dependent variable whose values are predicated on a set of feature values. It is for
4454             such situations that you will find useful the new class C<RegressionTree> that is now
4455             a part of the C<Algorithm::DecisionTree> module. The C<RegressionTree> class has been
4456             programmed as a subclass of the main C<DecisionTree> class.
4457              
4458             You can think of regression with a regression tree as a powerful generalization of
4459             the very commonly used Linear Regression algorithms. Although you can certainly
4460             carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4461             modeling nonlinearities between the predictor variables and the dependent variable,
4462             specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4463             can inject continuities between the predictor and the predicted variables that may
4464             not really exist in the real data. Regression trees, on the other hand, give you a
4465             piecewise linear relationship between the predictor and the predicted variables that
4466             is freed from the constraints of superimposed continuities at the joins between the
4467             different segments. See the following tutorial for further information regarding the
4468             standard linear regression approach and the regression that can be achieved with the
4469             RegressionTree class in this module:
4470             L
4471              
4472             The RegressionTree class in the current version of the module assumes that all of
4473             your data is numerical. That is, unlike what is possible with the DecisionTree class
4474             (and the other more closely related classes in this module) that allow your training
4475             file to contain a mixture of numerical and symbolic data, the RegressionTree class
4476             requires that ALL of your data be numerical. I hope to relax this constraint in
4477             future versions of this module. Obviously, the dependent variable will always be
4478             numerical for regression.
4479              
4480             See the example scripts in the directory C<ExamplesRegression> if you wish to become
4481             more familiar with the regression capabilities of the module.
4482              
4483             =over 4
4484              
4485             =item B<new():>
4486              
4487             my $training_datafile = "gendata5.csv";
4488             my $rt = Algorithm::RegressionTree->new(
4489             training_datafile => $training_datafile,
4490             dependent_variable_column => 2,
4491             predictor_columns => [1],
4492             mse_threshold => 0.01,
4493             max_depth_desired => 2,
4494             jacobian_choice => 0,
4495             csv_cleanup_needed => 1,
4496             );
4497              
4498             Note in particular the constructor parameters:
4499              
4500             dependent_variable_column
4501             predictor_columns
4502             mse_threshold
4503             jacobian_choice
4504              
4505             The first of these parameters, C<dependent_variable_column>, is set to the column index
4506             in the CSV file for the dependent variable. The second constructor parameter,
4507             C<predictor_columns>, tells the system which columns contain values for the
4508             predictor variables. The third parameter, C<mse_threshold>, is for deciding when to
4509             partition the data at a node into two child nodes as a regression tree is being
4510             constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4511             partitioning any of the features at a node is smaller than C<mse_threshold>, that
4512             node becomes a leaf node of the regression tree.
4513              
4514             The last parameter, C<jacobian_choice>, must be set to either 0 or 1 or 2. Its
4515             default value is 0. When this parameter equals 0, the regression coefficients are
4516             calculated using the linear least-squares method and no further "refinement" of the
4517             coefficients is carried out using gradient descent. This is the fastest way to
4518             calculate the regression coefficients. When C<jacobian_choice> is set to 1, you get
4519             a weak version of gradient descent in which the Jacobian is set to the "design
4520             matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
4521             approximation to the Jacobian. That, however, is at a cost of much longer
4522             computation time. B<Note:> For most cases, using 0 for C<jacobian_choice> is the
4523             best choice. See my tutorial on regression for why
4524             that is the case.
4525              
4526             =back
4527              
4528             =head2 B<Methods defined for C<RegressionTree> class>
4529              
4530             =over 8
4531              
4532             =item B<get_training_data_for_regression():>
4533              
4534             Only CSV training datafiles are allowed. Additionally, the first record in the file
4535             must list the names of the fields, and the first column must contain an integer ID
4536             for each record.
4537              
4538             =item B<construct_regression_tree():>
4539              
4540             As the name implies, this is the method that constructs a regression tree.
4541              
4542             =item B<display_regression_tree("   "):>
4543              
4544             Displays the regression tree, as the name implies. The white-space string argument
4545             specifies the offset to use in displaying the child nodes in relation to a parent
4546             node.
4547              
4548             =item B<prediction_for_single_data_point( $root_node, $test_sample ):>
4549              
4550             You call this method after you have constructed a regression tree if you want to
4551             calculate the prediction for one sample. The parameter C<$root_node> is what is
4552             returned by the call to C<construct_regression_tree()>. The formatting of the argument
4553             bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4554             using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4555             this case, the C<$test_sample> parameter will be bound to a list that will look like
4556              
4557             ['xvar1 = 23.4', 'xvar2 = 12.9']
4558              
4559             Arbitrary amount of white space, including none, on the two sides of the equality
4560             symbol is allowed in the construct shown above. A call to this method returns a
4561             hash with two key-value pairs. One of the keys is called C<solution_path> and
4562             the other C<prediction>. The value associated with the key C<solution_path> is the path
4563             in the regression tree to the leaf node that yielded the prediction. And the value
4564             associated with the key C<prediction> is the answer you are looking for.
4565              
4566             =item B<predictions_for_all_data_used_for_regression_estimation( $root_node ):>
4567              
4568             This call calculates the predictions for all of the predictor variables data in your
4569             training file. The parameter C<$root_node> is what is returned by the call to
4570             C<construct_regression_tree()>. The values for the dependent variable thus predicted
4571             can be seen by calling C<display_all_plots()>, which is the method mentioned below.
4572              
4573             =item B<display_all_plots():>
4574              
4575             This method displays the results obtained by calling the prediction method of the
4576             previous entry. This method also creates a hardcopy of the plots and saves it as a
4577             C<.png> disk file. The name of this output file is always C.
4578              
4579             =item B<mse_for_tree_regression_for_all_training_samples( $root_node ):>
4580              
4581             This method carries out an error analysis of the predictions for the samples in your
4582             training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4583             regression, the MSE for the data samples at each of the leaf nodes of the regression
4584             tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4585             The parameter C<$root_node> in the call syntax is what is returned by the call to
4586             C<construct_regression_tree()>.
4587              
4588             =item B<bulk_predictions_for_data_in_a_csv_file( $root_node, $filename, $columns ):>
4589              
4590             Call this method if you want to apply the regression tree to all your test data in a
4591             disk file. The predictions for all of the test samples in the disk file are written
4592             out to another file whose name is the same as that of the test file except for the
4593             addition of C<_output> in the name of the file. The parameter C<$filename> is the
4594             name of the disk file that contains the test data. And the parameter C<$columns> is a
4595             list of the column indices for the predictor variables in the test file.
4596              
4597             =back
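
For reference, here is a minimal sketch of a regression-tree session that strings
together the calls described above. It assumes a hash reference is returned by
C<prediction_for_single_data_point()> with the C<prediction> and C<solution_path>
keys described earlier, and it uses the illustrative predictor name C<xvar1>; check
the scripts in the C<ExamplesRegression> directory for the exact calling conventions:

    use Algorithm::RegressionTree;

    my $rt = Algorithm::RegressionTree->new(
                 training_datafile         => 'gendata5.csv',
                 dependent_variable_column => 2,
                 predictor_columns         => [1],
                 mse_threshold             => 0.01,
                 max_depth_desired         => 2,
                 jacobian_choice           => 0,
                 csv_cleanup_needed        => 1,
             );
    $rt->get_training_data_for_regression();
    my $root_node = $rt->construct_regression_tree();
    $rt->display_regression_tree("   ");

    # Prediction for a single data point:
    my $answer = $rt->prediction_for_single_data_point($root_node, ['xvar1 = 23.4']);
    print "Prediction: $answer->{prediction}\n";

    # Error analysis over the training data:
    $rt->mse_for_tree_regression_for_all_training_samples($root_node);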
4598              
4599             =head1 GENERATING SYNTHETIC TRAINING DATA
4600              
4601             The module file contains the following additional classes: (1)
4602             C<TrainingDataGeneratorNumeric>, and (2) C<TrainingDataGeneratorSymbolic> for
4603             generating synthetic training data.
4604              
4605             The class C<TrainingDataGeneratorNumeric> outputs one CSV file for the
4606             training data and another one for the test data for experimenting with numeric
4607             features. The numeric values are generated using a multivariate Gaussian
4608             distribution whose mean and covariance are specified in a parameter file. See the
4609             file C<param_numeric.txt> in the C<Examples> directory for an example of such a
4610             parameter file. Note that the dimensionality of the data is inferred from the
4611             information you place in the parameter file.
4612              
4613             The class C<TrainingDataGeneratorSymbolic> generates synthetic training data for the
4614             purely symbolic case. The relative frequencies of the different possible values for
4615             the features is controlled by the biasing information you place in a parameter file.
4616             See C<param_symbolic.txt> for an example of such a file.
4617              
4618              
4619             =head1 THE C<Examples> DIRECTORY
4620              
4621             See the C<Examples> directory in the distribution for how to construct a decision
4622             tree, and how to then classify new data using the decision tree. To become more
4623             familiar with the module, run the scripts
4624              
4625             construct_dt_and_classify_one_sample_case1.pl
4626             construct_dt_and_classify_one_sample_case2.pl
4627             construct_dt_and_classify_one_sample_case3.pl
4628             construct_dt_and_classify_one_sample_case4.pl
4629              
4630             The first script is for the purely symbolic case, the second for the case that
4631             involves both numeric and symbolic features, the third for the case of purely numeric
4632             features, and the last for the case when the training data is synthetically generated
4633             by the script C<generate_training_data_numeric.pl>.
4634              
4635             Next run the following script as it is for bulk classification of data records placed
4636             in a CSV file:
4637              
4638             classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4639              
4640             The script first constructs a decision tree using the training data in the training
4641             file supplied by the first argument file C<training4.csv>. The script then
4642             calculates the class label for each data record in the test data file supplied
4643             through the second argument file, C<test4.csv>. The estimated class labels are
4644             written out to the output file which in the call shown above is C<out4.csv>. An
4645             important thing to note here is that your test file --- in this case C<test4.csv> ---
4646             must have a column for class labels. Obviously, in real-life situations, there will
4647             be no class labels in this column. When that is the case, you can place an empty
4648             string C<""> there for each data record. This is demonstrated by the following call:
4649              
4650             classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4651              
4652             The following script in the C<Examples> directory
4653              
4654             classify_by_asking_questions.pl
4655              
4656             shows how you can use a decision-tree classifier interactively. In this mode, you
4657             first construct the decision tree from the training data and then the user is
4658             prompted for answers to the feature tests at the nodes of the tree.
4659              
4660             If your training data has a feature whose values span a large range and, at the same
4661             time, are characterized by a heavy-tail distribution, you should look at the script
4662              
4663             construct_dt_for_heavytailed.pl
4664              
4665             to see how to use the option C<number_of_histogram_bins> in the call to the
4666             constructor. This option was introduced in Version 2.22 for dealing with such
4667             features. If you do not set this option, the module will use the default value of
4668             500 for the number of points at which to sample the value range for such a feature.
4669              
4670             The C<Examples> directory also contains the following scripts:
4671              
4672             generate_training_data_numeric.pl
4673             generate_training_data_symbolic.pl
4674              
4675             that show how you can use the module to generate synthetic training data. Synthetic
4676             training data is generated according to the specifications laid out in a parameter file.
4677             There are constraints on how the information is laid out in a parameter file. See
4678             the files C<param_numeric.txt> and C<param_symbolic.txt> in the C<Examples> directory
4679             for how to structure these files.
4680              
4681             The C<Examples> directory of Versions 2.1 and higher of the module also contains the
4682             following two scripts:
4683              
4684             evaluate_training_data1.pl
4685             evaluate_training_data2.pl
4686              
4687             that illustrate how the Perl class C<EvalTrainingData> can be used to evaluate the
4688             quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4689             class is a subclass of the C<DecisionTree> class in the module file. See the README
4690             in the C<Examples> directory for further information regarding these two scripts.
4691              
4692             The C<Examples> directory of Versions 2.31 and higher of the module contains the
4693             following three scripts:
4694              
4695             introspection_in_a_loop_interactive.pl
4696              
4697             introspection_show_training_samples_at_all_nodes_direct_influence.pl
4698              
4699             introspection_show_training_samples_to_nodes_influence_propagation.pl
4700              
4701             The first script illustrates how to use the C<DTIntrospection> class of the module
4702             interactively for generating explanations for the classification decisions made at
4703             the nodes of the decision tree. In the interactive session you are first asked for
4704             the node number you are interested in. Subsequently, you are asked for whether or
4705             not you are interested in specific questions that the introspector can provide
4706             answers for. The second script generates a tabular display that shows for each node
4707             of the decision tree a list of the training samples that fall directly in the portion
4708             of the feature space assigned to that node. (As mentioned elsewhere in this
4709             documentation, when this list is empty for a node, that means the node is a result of
4710             the generalization achieved by probabilistic modeling of the data. Note that this
4711             module constructs a decision tree NOT by partitioning the set of training samples,
4712             BUT by partitioning the domains of the probability density functions.) The third
4713             script listed above also generates a tabular display, but one that shows how the
4714             influence of each training sample propagates in the tree. This display first shows
4715             the list of nodes that are affected directly by the data in a training sample. This
4716             list is followed by an indented display of the nodes that are affected indirectly by
4717             the training sample. A training sample affects a node indirectly if the node is a
4718             descendant of one of the nodes affected directly.
4719              
4720             The latest addition to the Examples directory is the script:
4721              
4722             get_indexes_associated_with_fields.py
4723              
4724             As to why you may find this script useful, note that large database files may have
4725             hundreds of fields and it is not always easy to figure out what numerical index is
4726             associated with a given field. At the same time, the constructor of the DecisionTree
4727             module requires that the field that holds the class label and the fields that contain
4728             the feature values be specified by their numerical zero-based indexes. If you have a
4729             large database and you are faced with this problem, you can run this script to see
4730             the zero-based numerical index values associated with the different columns of your
4731             CSV file.
4732              
4733              
4734             =head1 THE C<ExamplesBagging> DIRECTORY
4735              
4736             The C<ExamplesBagging> directory contains the following scripts:
4737              
4738             bagging_for_classifying_one_test_sample.pl
4739            
4740             bagging_for_bulk_classification.pl
4741              
4742             As the names of the scripts imply, the first shows how to call the different methods
4743             of the C<DecisionTreeWithBagging> class for classifying a single test sample. When
4744             you are classifying a single test sample, you can also see how each bag is
4745             classifying the test sample. You can, for example, display the training data used in
4746             each bag, the decision tree constructed for each bag, etc.
4747              
4748             The second script is for the case when you place all of the test samples in a single
4749             file. The demonstration script displays for each test sample a single aggregate
4750             classification decision that is obtained through majority voting by all the decision
4751             trees.
4752              
4753              
4754             =head1 THE C<ExamplesBoosting> DIRECTORY
4755              
4756             The C<ExamplesBoosting> subdirectory in the main installation directory contains the
4757             following three scripts:
4758              
4759             boosting_for_classifying_one_test_sample_1.pl
4760              
4761             boosting_for_classifying_one_test_sample_2.pl
4762              
4763             boosting_for_bulk_classification.pl
4764              
4765             As the names of the first two scripts imply, these show how to call the different
4766             methods of the C<Algorithm::BoostedDecisionTree> class for classifying a single test sample.
4767             When you are classifying a single test sample, you can see how each stage of the
4768             cascade of decision trees is classifying the test sample. You can also view each
4769             decision tree separately and also see the trust factor associated with the tree.
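
As a rough illustration of that calling sequence, here is a minimal sketch for a single
test sample. The method names follow the demo scripts and are assumptions rather than a
definitive API reference; the training file, column indexes, and test sample are
placeholders:

    use Algorithm::BoostedDecisionTree;

    my $booster = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => 'training.csv',    # placeholder
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2, 3],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      how_many_stages          => 4,
                      csv_cleanup_needed       => 1,
                  );
    $booster->get_training_data_for_base_tree();
    $booster->calculate_first_order_probabilities_and_class_priors();
    $booster->construct_base_decision_tree();
    $booster->construct_cascade_of_trees();
    $booster->display_decision_trees_for_different_stages_of_cascade();   # optional

    my @test_sample = qw/feature1=val1 feature2=val2/;    # placeholder sample
    $booster->classify_with_boosting(\@test_sample);
    $booster->display_classification_results_for_all_trees();
    $booster->trust_weighted_majority_vote_classifier();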
4770              
4771             The third script is for the case when you place all of the test samples in a single
4772             file. The demonstration script outputs for each test sample a single aggregate
4773             classification decision that is obtained through trust-factor weighted majority
4774             voting by all the decision trees.
4775              
4776             =head1 THE C DIRECTORY
4777              
4778             The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
4779             become more familiar with the C<Algorithm::RandomizedTreesForBigData> class for solving
4780             needle-in-a-haystack and big-data classification problems. These scripts are:
4781              
4782             randomized_trees_for_classifying_one_test_sample_1.pl
4783              
4784             randomized_trees_for_classifying_one_test_sample_2.pl
4785              
4786             classify_database_records.pl
4787              
4788             The first script shows the constructor options to use for solving a
4789             needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
4790             training data belongs to just one class. The second script shows the constructor
4791             options for using randomized decision trees for the case when you have access to a
4792             very large database of training samples and you'd like to construct an ensemble of
4793             decision trees using training samples pulled randomly from the training database.
4794             The last script illustrates how you can evaluate the classification power of an
4795             ensemble of decision trees as constructed by C<Algorithm::RandomizedTreesForBigData> by classifying
4796             a large number of test samples extracted randomly from the training database.
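
For concreteness, here is a sketch of what the calls might look like for the
needle-in-a-haystack case. The constructor option and method names are taken from the
demo scripts and should be treated as assumptions; the file name, column indexes, and
test sample are placeholders:

    use Algorithm::RandomizedTreesForBigData;

    my $rt = Algorithm::RandomizedTreesForBigData->new(
                 training_datafile               => 'my_large_database.csv',   # placeholder
                 csv_class_column_index          => 48,                        # placeholder
                 csv_columns_for_features        => [24, 32, 33, 34, 41],      # placeholder
                 entropy_threshold               => 0.01,
                 max_depth_desired               => 8,
                 how_many_trees                  => 20,
                 looking_for_needles_in_haystack => 1,   # spread the rare class over all trees
                 csv_cleanup_needed              => 1,
             );
    $rt->get_training_data_for_N_trees();
    $rt->calculate_first_order_probabilities();
    $rt->calculate_class_priors();
    $rt->construct_all_decision_trees();

    my @test_sample = qw/feature1=val1 feature2=val2/;    # placeholder sample
    $rt->classify_with_all_trees(\@test_sample);
    $rt->display_classification_results_for_all_trees();
    $rt->get_majority_vote_classification();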
4797              
4798              
4799             =head1 THE C<ExamplesRegression> DIRECTORY
4800              
4801             The C<ExamplesRegression> subdirectory in the main installation directory shows
4802             example scripts that you can use to become familiar with regression trees and how
4803             they can be used for nonlinear regression. If you are new to the concept of
4804             regression trees, start by executing the following scripts without changing them and
4805             see what sort of output is produced by them:
4806              
4807             regression4.pl
4808              
4809             regression5.pl
4810              
4811             regression6.pl
4812              
4813             regression8.pl
4814              
4815             The C<regression4.pl> script involves only one predictor variable and one dependent
4816             variable. The training data for this exercise is drawn from a CSV datafile supplied
4817             in the same directory; that data is strongly nonlinear. When you run the script
4818             C<regression4.pl>, you will see how much better the result from tree regression is
4819             compared to what you can get with linear regression.
4820              
4821             The C<regression5.pl> script is essentially the same as the previous script except
4822             that its training datafile consists of three noisy segments, as opposed to just two
4823             in the previous case.
4824              
4825             The script C<regression6.pl> deals with the case when we have two predictor variables
4826             and one dependent variable. You can think of the data as consisting of noisy height
4827             values over an C<(x1,x2)> plane. The data used in this script is drawn from a CSV
4828             file supplied in the same directory.
4829              
4830             Finally, the script C<regression8.pl> shows how you can carry out bulk prediction for
4831             all your test data records in a disk file. The script writes all the calculated
4832             predictions into another disk file whose name is derived from the name of the test
4833             data file.
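
If you would like to try tree regression on your own data after running these scripts,
here is a minimal sketch for the one-predictor case. The C<Algorithm::RegressionTree>
parameter and method names follow the regression demo scripts and should be treated as
assumptions; the file name, column indexes, and test point are placeholders:

    use Algorithm::RegressionTree;

    my $reg_tree = Algorithm::RegressionTree->new(
                       training_datafile         => 'mydata.csv',   # placeholder
                       dependent_variable_column => 2,              # placeholder
                       predictor_columns         => [1],            # placeholder
                       mse_threshold             => 0.01,
                       max_depth_desired         => 2,
                       csv_cleanup_needed        => 1,
                   );
    $reg_tree->get_training_data_for_regression();
    my $root_node = $reg_tree->construct_regression_tree();
    $reg_tree->display_regression_tree("     ");

    # Predict the dependent variable for one value of the predictor; the
    # "name = value" form for the test point follows the demo scripts.
    my @test_point = ('x = 30.5');
    $reg_tree->prediction_for_single_data_point($root_node, \@test_point);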
4834              
4835              
4836             =head1 EXPORT
4837              
4838             None by design.
4839              
4840             =head1 BUGS
4841              
4842             Please notify the author if you encounter any bugs. When sending email, please place
4843             the string 'DecisionTree' in the subject line.
4844              
4845             =head1 INSTALLATION
4846              
4847             Download the archive from CPAN in any directory of your choice. Unpack the archive
4848             with a command that on a Linux machine would look like:
4849              
4850             tar zxvf Algorithm-DecisionTree-3.42.tar.gz
4851              
4852             This will create an installation directory for you whose name will be
4853             C<Algorithm-DecisionTree-3.42>. Enter this directory and execute the following
4854             commands for a standard install of the module if you have root privileges:
4855              
4856             perl Makefile.PL
4857             make
4858             make test
4859             sudo make install
4860              
4861             If you do not have root privileges, you can carry out a non-standard install of the
4862             module in any directory of your choice by:
4863              
4864             perl Makefile.PL prefix=/some/other/directory/
4865             make
4866             make test
4867             make install
4868              
4869             With a non-standard install, you may also have to set your PERL5LIB environment
4870             variable so that this module can find the required other modules. How you do that
4871             would depend on what platform you are working on. In order to install this module on
4872             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
4873             variable by
4874              
4875             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4876              
4877             If I used bash, I'd need to declare:
4878              
4879             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4880              
4881              
4882             =head1 THANKS
4883              
4884             I wish to thank many users of this module for their feedback. Many of the
4885             improvements I have made to the module over the years are a result of the feedback
4886             received.
4887              
4888             I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
4889             Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
4890             used for testing the module. Slaven's feedback in the form of the Bug report
4891             C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
4892             the required version of Perl to 5.10.
4893              
4894             On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version 2.27,
4895             I am removing the Perl version restriction altogether from Version 2.30. Thanks
4896             Slaven!
4897              
4898              
4899             =head1 AUTHOR
4900              
4901             The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
4902             with the publication of the book I<Designing with Objects> by John Wiley. If
4903             interested, visit his web page at Purdue to find out what this project was all
4904             about. You might like I<Designing with Objects> especially if you enjoyed reading
4905             Harry Potter as a kid (or even as an adult, for that matter).
4906              
4907             If you send email regarding this module, please place the string "DecisionTree" in
4908             your subject line to get past my spam filter. Avi Kak's email address is
4909             C<kak@purdue.edu>
4910              
4911             =head1 COPYRIGHT
4912              
4913             This library is free software; you can redistribute it and/or modify it under the
4914             same terms as Perl itself.
4915              
4916             Copyright 2016 Avinash Kak
4917              
4918             =cut
4919