package Algorithm::DecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
# multidimensional data.
# -------------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;

our $VERSION = '3.43';

############################################   Constructor  ##############################################
sub new {
    my ($class, %args, $eval_or_boosting_mode);
    if (@_ % 2 != 0) {
        ($class, %args) = @_;
    } else {
        $class = shift;
        $eval_or_boosting_mode = shift;
        die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
        die "Only one string arg allowed in eval and boosting modes" if @_;
    }
    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
                           if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                   =>    $args{training_datafile},
        _entropy_threshold                   =>    $args{entropy_threshold} || 0.01,
        _max_depth_desired                   =>    exists $args{max_depth_desired} ?
                                                                       $args{max_depth_desired} : undef,
        _debug1                              =>    $args{debug1} || 0,
        _debug2                              =>    $args{debug2} || 0,
        _debug3                              =>    $args{debug3} || 0,
        _csv_class_column_index              =>    $args{csv_class_column_index} || undef,
        _csv_columns_for_features            =>    $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold
                                             =>    $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins            =>    $args{number_of_histogram_bins} || undef,
        _csv_cleanup_needed                  =>    $args{csv_cleanup_needed} || 0,
        _training_data                       =>    [],
        _root_node                           =>    undef,
        _probability_cache                   =>    {},
        _entropy_cache                       =>    {},
        _training_data_hash                  =>    {},
        _features_and_values_hash            =>    {},
        _samples_class_label_hash            =>    {},
        _class_names                         =>    [],
        _class_priors                        =>    [],
        _class_priors_hash                   =>    {},
        _feature_names                       =>    [],
        _numeric_features_valuerange_hash    =>    {},
        _sampling_points_for_numeric_feature_hash      =>      {},
        _feature_values_how_many_uniques_hash          =>      {},
        _prob_distribution_numeric_features_hash       =>      {},
        _histogram_delta_hash                          =>      {},
        _num_of_histogram_bins_hash                    =>      {},
    }, $class;
}
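
##  A minimal constructor-usage sketch (the file name and column indices below are
##  hypothetical placeholders and must be adapted to your own training data):
##
##      my $dt = Algorithm::DecisionTree->new(
##                              training_datafile => 'training_data.csv',
##                              csv_class_column_index => 1,
##                              csv_columns_for_features => [2,3,4,5],
##                              entropy_threshold => 0.01,
##                              max_depth_desired => 8,
##                              symbolic_to_numeric_cardinality_threshold => 10,
##                              csv_cleanup_needed => 1,
##                      );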
 
####################################  Classify with Decision Tree  #######################################

##  Classifies one test sample at a time using the decision tree constructed from
##  your training file.  The data record for the test sample must be supplied as
##  shown in the scripts in the `examples' subdirectory.  See the scripts
##  construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
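##
##  A sketch of a typical call (the feature names and values shown are hypothetical
##  and must match the names and values used in your own training file):
##
##      my @test_sample = ('feature1=4.2', 'feature2=high');
##      my $classification = $dt->classify($root_node, \@test_sample);
##      print "solution path: @{$classification->{'solution_path'}}\n";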
 
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "\n\nError in the names you have used for features and/or values.  " .
        "Try using the csv_cleanup_needed option in the constructor call."
                    unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
                                                                       $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node,
                                                                    \@features_and_values,\%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name  with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}
 
sub recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $features_and_values = shift;
    my $answer = shift;
    my @features_and_values = @$features_and_values;
    my %answer = %$answer;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;
    }
    # The following clause introduced in Version 3.20
    if (!defined $value_for_feature) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    if ($value_for_feature) {
        if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
            print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
            my $pattern1 = '(.+)<(.+)';
            my $pattern2 = '(.+)>(.+)';
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
                if ($last_feature_and_value_on_branch =~ /$pattern1/) {
                    my ($feature, $threshold) = ($1,$2);
                    if ($value_for_feature <= $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                                             $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
                if ($last_feature_and_value_on_branch =~ /$pattern2/) {
                    my ($feature, $threshold) = ($1,$2);
                    if ($value_for_feature > $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                                            $features_and_values,\%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
            }
            return \%answer if $path_found;
        } else {
            my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
            print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
                if $self->{_debug3};
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
                if ($last_feature_and_value_on_branch eq $feature_value_combo) {
                    %answer = %{$self->recursive_descent_for_classification($child,
                                                                              $features_and_values,\%answer)};
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    $path_found = 1;
                    last;
                }
            }
            return \%answer if $path_found;
        }
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}
 
##  If you want classification to be carried out by engaging a human user in a
##  question-answer session, this is the method to use for that purpose.  See, for
##  example, the script classify_by_asking_questions.pl in the `examples'
##  subdirectory for an illustration of how to do that.
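##
##  A sketch of a typical call (assumes the tree has already been constructed and
##  its root node is in $root_node):
##
##      my $classification = $dt->classify_by_asking_questions($root_node);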
 
sub classify_by_asking_questions {
    my $self = shift;
    my $root_node = shift;
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %scratchpad_for_numeric_answers = ();
    foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
        $scratchpad_for_numeric_answers{$feature_name} = undef;
    }
    my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
                                                       \%answer, \%scratchpad_for_numeric_answers)};
    @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} =  \@outlist;
        }
    }
    return \%classification_for_display;
}

sub interactive_recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $answer = shift;
    my $scratchpad_for_numerics = shift;
    my %answer = %$answer;
    my %scratchpad_for_numerics = %$scratchpad_for_numerics;
    my $pattern1 = '(.+)<(.+)';
    my $pattern2 = '(.+)>(.+)';
    my $user_value_for_feature;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my @list_of_branch_attributes_to_children = ();
    foreach my $child (@children) {
        my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
        my $feature_and_value_on_branch = $branch_features_and_values[-1];
        push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
    }
    my $feature_tested_at_node = $node->get_feature();
    my $feature_value_combo;
    my $path_found = 0;
    if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
        if ($scratchpad_for_numerics{$feature_tested_at_node}) {
            $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
        } else {
            my @valuerange =  @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
            while (1) {
                print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
                print "\nEnter a value in the range (@valuerange): ";
                $user_value_for_feature = <STDIN>;
                $user_value_for_feature =~ s/\r?\n?$//;
                $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
                my $answer_found = 0;
                if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
                    $answer_found = 1;
                    last;
                }
                last if $answer_found;
                 print("You entered illegal value. Let's try again")  
 
            }
            $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
        }
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute =~ /$pattern1/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature <= $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
            if ($branch_attribute =~ /$pattern2/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature > $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                     \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
        }
        return \%answer if $path_found;
    } else {
        my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
        while (1) {
            print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
            print "\nEnter a value from the list (@possible_values_for_feature): ";
            $user_value_for_feature = <STDIN>;
            $user_value_for_feature =~ s/\r?\n?$//;
            $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
            my $answer_found = 0;
            if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
                $answer_found = 1;
                last;
            }
            last if $answer_found;
             print("You entered illegal value. Let's try again");  
 
        }
        $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute eq $feature_value_combo) {
                %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                                 \%answer, \%scratchpad_for_numerics)};
                $path_found = 1;
                push @{$answer{'solution_path'}}, $node->get_serial_num();
                last;
            }
        }
        return \%answer if $path_found;
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}
 
######################################    Decision Tree Construction  ####################################

##  At the root node, we find the best feature that yields the greatest reduction in
##  class entropy from the entropy based on just the class priors. The logic for
##  finding this feature is different for symbolic features and for numeric features.
##  That logic is built into the method shown later for best feature calculations.
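##
##  A sketch of the usual call sequence (the data-ingestion methods named below are
##  part of this module's customary usage pattern and are defined elsewhere in this
##  file, outside the section shown here):
##
##      $dt->get_training_data();
##      $dt->calculate_first_order_probabilities();
##      $dt->calculate_class_priors();
##      my $root_node = $dt->construct_decision_tree_classifier();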
 
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {
        $self->determine_data_condition();
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) {
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}

##  After the root node of the decision tree is calculated by the previous methods,
##  we invoke this method recursively to create the rest of the tree.  At each node,
##  we find the feature that achieves the largest entropy reduction with regard to
##  the partitioning of the training data samples that correspond to that node.
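##
##  Concretely, the stopping rule applied in the code below (a reading aid, not a
##  new algorithm) is that a child node is grown only when
##
##      entropy_gain  =  existing_node_entropy - best_feature_entropy  >  _entropy_threshold
##
##  and the descent also stops once the branch length reaches max_depth_desired,
##  when that constructor option is set.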
 
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
        if $self->{_debug3};
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my $existing_node_entropy = $node->get_node_entropy();
    if ($self->{_debug3}) {
        print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) {
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
                $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) &&
           (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) {
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
              $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
                                        $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;            # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child  =
                                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
                                                                  $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
                                                               $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                      "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node =
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node =
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
              \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                                                         \@class_probabilities_for_lessthan_child_node,
                              \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                                                         \@class_probabilities_for_greaterthan_child_node,
                            \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds =
                                                      ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds =
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds,
                                       $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                   map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                               \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child =
                      $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                                                         \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n";
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                          \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
            if $self->{_debug3};
        return;
    }
}
 
##  This is the heart of the decision tree constructor.  Its main job is to figure
##  out the best feature to use for partitioning the training data samples that
##  correspond to the current node.  The search for the best feature is carried out
##  differently for symbolic features and for numeric features.  For a symbolic
##  feature, the method estimates the entropy for each value of the feature and then
##  averages out these entropies as a measure of the discriminatory power of that
##  feature.  For a numeric feature, on the other hand, it estimates the entropy
##  reduction that can be achieved if we were to partition the set of training
##  samples at each possible threshold.  For a numeric feature, all possible sampling
##  points relevant to the node in question are considered as candidates for thresholds.
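##
##  As a reading aid (standard decision-tree notation, not code from this module):
##  for a symbolic feature f, the figure of merit just described is the average
##  conditional class entropy
##
##      H(C|f)  =  sum over values v of   p(f=v) * H(C | f=v)
##
##  while for a numeric feature the analogous average is taken over the two
##  partitions induced by each candidate threshold.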
 
sub best_feature_calculator {
    my $self = shift;
    my $features_and_values_or_thresholds_on_branch = shift;
    my $existing_node_entropy = shift;
    my @features_and_values_or_thresholds_on_branch =  @$features_and_values_or_thresholds_on_branch;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @all_symbolic_features = ();
    foreach my $feature_name (@{$self->{_feature_names}}) {
        push @all_symbolic_features, $feature_name
            if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
    }
    my @symbolic_features_already_used = ();
    foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
        push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
    }
    my @symbolic_features_not_yet_used;
    foreach my $x (@all_symbolic_features) {
        push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
    }
    my @true_numeric_types = ();
    my @symbolic_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@features_and_values_or_thresholds_on_branch) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            push @true_numeric_types_feature_names, $1;
        } elsif ($item =~ /$pattern1/) {
            push @symbolic_types, $item;
            push @symbolic_types_feature_names, $1;
        } else {
            die "format error in the representation of feature and values or thresholds";
        }
    }
    my %seen = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
    %seen = ();
    @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
    my @bounded_intervals_numeric_types =
                       @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature (@true_numeric_types_feature_names) {
        $upperbound{$feature} = undef;
        $lowerbound{$feature} = undef;
    }
    foreach my $item (@bounded_intervals_numeric_types) {
 
597 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $feature_grouping (@$item) {  
 
598 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             if ($feature_grouping->[1] eq '>') {  
 
599 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];  
 
600 
 
 
 
 
 
 
 
 
 
 
 
 
 
             } else {  
 
601 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];  
 
602 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
603 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
604 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
605 
 
46
 
 
 
 
 
 
 
 
 
141
 
     my %entropy_values_for_different_features = ();  
 
606 
 
46
 
 
 
 
 
 
 
 
 
114
 
     my %partitioning_point_child_entropies_hash = ();  
 
607 
 
46
 
 
 
 
 
 
 
 
 
107
 
     my %partitioning_point_threshold = ();  
 
608 
 
46
 
 
 
 
 
 
 
 
 
118
 
     my %entropies_for_different_values_of_symbolic_feature = ();  
 
609 
 
46
 
 
 
 
 
 
 
 
 
106
 
     foreach my $feature (@{$self->{_feature_names}}) {  
 
  
 
46
 
 
 
 
 
 
 
 
 
159
 
    
 
610 
 
184
 
 
 
 
 
 
 
 
 
448
 
         $entropy_values_for_different_features{$feature} = [];  
 
611 
 
184
 
 
 
 
 
 
 
 
 
453
 
         $partitioning_point_child_entropies_hash{$feature} = {};  
 
612 
 
184
 
 
 
 
 
 
 
 
 
374
 
         $partitioning_point_threshold{$feature} = undef;  
 
613 
 
184
 
 
 
 
 
 
 
 
 
519
 
         $entropies_for_different_values_of_symbolic_feature{$feature} = [];  
 
614 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
615 
 
46
 
 
 
 
 
 
 
 
 
112
 
     foreach my $i (0..@{$self->{_feature_names}}-1) {  
 
  
 
46
 
 
 
 
 
 
 
 
 
167
 
    
 
616 
 
184
 
 
 
 
 
 
 
 
 
544
 
         my $feature_name = $self->{_feature_names}->[$i];  
 
617 
 
184
 
  
 50
   
 
 
 
 
 
 
 
545
 
         print "\n\nBFC1          FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};  
 
618 
 
184
 
  
100
   
 
  
 33
   
 
 
 
 
 
590
 
         if (contained_in($feature_name, @symbolic_features_already_used)) {  
 
  
 
 
 
  
 50
   
 
 
 
 
 
 
 
 
 
    
 
619 
 
138
 
 
 
 
 
 
 
 
 
431
 
             next;  
 
620 
 
46
 
 
 
 
 
 
 
 
 
231
 
         } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&  
 
621 
 
 
 
 
 
 
 
 
 
 
 
 
 
                  $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >  
 
622 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                       $self->{_symbolic_to_numeric_cardinality_threshold}) {  
 
623 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
624 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};        
 
625 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @newvalues = ();  
 
626 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             if (contained_in($feature_name, @true_numeric_types_feature_names)) {  
 
627 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
                 if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&  
 
  
 
 
 
  
  0
   
 
  
  0
   
 
 
 
 
 
 
 
    
 
  
 
 
 
  
  0
   
 
  
  0
   
 
 
 
 
 
 
 
    
 
  
 
 
 
  
  0
   
 
  
  0
   
 
 
 
 
 
 
 
    
 
628 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               $lowerbound{$feature_name} >= $upperbound{$feature_name}) {  
 
629 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     next;  
 
630 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&  
 
631 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                     $lowerbound{$feature_name} < $upperbound{$feature_name}) {  
 
632 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     foreach my $x (@values) {  
 
633 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
                         push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};  
 
634 
 
 
 
 
 
 
 
 
 
 
 
 
 
                     }  
 
635 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 } elsif (defined($upperbound{$feature_name})) {  
 
636 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     foreach my $x (@values) {  
 
637 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
                         push @newvalues, $x if $x <= $upperbound{$feature_name};  
 
638 
 
 
 
 
 
 
 
 
 
 
 
 
 
                     }  
 
639 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 } elsif (defined($lowerbound{$feature_name})) {  
 
640 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     foreach my $x (@values) {  
 
641 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
                         push @newvalues, $x if $x > $lowerbound{$feature_name};  
 
642 
 
 
 
 
 
 
 
 
 
 
 
 
 
                     }  
 
643 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 } else {  
 
644 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     die "Error is bound specifications in best feature calculator";  
 
645 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 }  
 
646 
 
 
 
 
 
 
 
 
 
 
 
 
 
             } else {  
 
647 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 @newvalues = @{deep_copy_array(\@values)};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
648 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
649 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             next if @newvalues == 0;  
 
650 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @partitioning_entropies = ();              
 
651 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             foreach my $value (@newvalues) {  
 
652 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my $feature_and_less_than_value_string =  "$feature_name" . '<' . "$value";  
 
653 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";  
 
654 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my @for_left_child;  
 
655 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 my @for_right_child;  
 
656 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
                 if (@features_and_values_or_thresholds_on_branch) {  
 
657 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
658 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     push @for_left_child, $feature_and_less_than_value_string;  
 
659 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
660 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     push @for_right_child, $feature_and_greater_than_value_string;  
 
661 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 } else {  
 
662 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     @for_left_child = ($feature_and_less_than_value_string);  
 
663 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     @for_right_child = ($feature_and_greater_than_value_string);  
 
664 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 }  
 
665 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(  
 
666 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                     \@features_and_values_or_thresholds_on_branch, $feature_name, $value);  
 
667 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(  
 
668 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                     \@features_and_values_or_thresholds_on_branch, $feature_name, $value);  
 
669 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my $partitioning_entropy = $entropy1 *   
 
670 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +  
 
671 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                            $entropy2 *  
 
672 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);  
 
673 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
674 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 push @partitioning_entropies, $partitioning_entropy;  
 
675 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];  
 
676 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
677 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);  
 
678 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             if ($min_entropy < $existing_node_entropy) {  
 
679 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];  
 
680 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $entropy_values_for_different_features{$feature_name} = $min_entropy;  
 
681 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
682 
 
 
 
 
 
 
 
 
 
 
 
 
 
         } else {  
 
683 
 
46
 
  
 50
   
 
 
 
 
 
 
 
190
 
             print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};  
 
684 
 
46
 
  
 50
   
 
 
 
 
 
 
 
175
 
             print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};  
 
685 
 
46
 
 
 
 
 
 
 
 
 
140
 
             my %seen;  
 
686 
 
159
 
  
 50
   
 
 
 
 
 
 
 
1182
 
             my @values = grep {$_ ne 'NA' && !$seen{$_}++}   
 
687 
 
46
 
 
 
 
 
 
 
 
 
117
 
                                     @{$self->{_features_and_unique_values_hash}->{$feature_name}};  
 
  
 
46
 
 
 
 
 
 
 
 
 
223
 
    
 
688 
 
46
 
 
 
 
 
 
 
 
 
282
 
             @values = sort @values;  
 
689 
 
46
 
  
 50
   
 
 
 
 
 
 
 
210
 
             print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};  
 
690 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
691 
 
46
 
 
 
 
 
 
 
 
 
134
 
             my $entropy = 0;  
 
692 
 
46
 
 
 
 
 
 
 
 
 
154
 
             foreach my $value (@values) {  
 
693 
 
159
 
 
 
 
 
 
 
 
 
674
 
                 my $feature_value_string = "$feature_name" . '=' . "$value";  
 
694 
 
159
 
  
 50
   
 
 
 
 
 
 
 
536
 
                 print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};  
 
695 
 
159
 
 
 
 
 
 
 
 
 
313
 
                 my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};  
 
  
 
159
 
 
 
 
 
 
 
 
 
528
 
    
 
696 
 
159
 
  
100
   
 
 
 
 
 
 
 
570
 
                 if (@features_and_values_or_thresholds_on_branch) {  
 
697 
 
145
 
 
 
 
 
 
 
 
 
385
 
                     push @extended_attributes, $feature_value_string;  
 
698 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 } else {  
 
699 
 
14
 
 
 
 
 
 
 
 
 
62
 
                     @extended_attributes = ($feature_value_string);  
 
700 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 }  
 
701 
 
159
 
 
 
 
 
 
 
 
 
608
 
                 $entropy +=   
 
702 
 
 
 
 
 
 
 
 
 
 
 
 
 
            $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *   
 
703 
 
 
 
 
 
 
 
 
 
 
 
 
 
            $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);  
 
704 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .  
 
705 
 
159
 
  
 50
   
 
 
 
 
 
 
 
595
 
                       "is $entropy\n" if $self->{_debug3};  
 
706 
 
159
 
 
 
 
 
 
 
 
 
377
 
                 push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;  
 
  
 
159
 
 
 
 
 
 
 
 
 
799
 
    
 
707 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
708 
 
46
 
  
 50
   
 
 
 
 
 
 
 
225
 
             if ($entropy < $existing_node_entropy) {  
 
709 
 
46
 
 
 
 
 
 
 
 
 
465
 
                 $entropy_values_for_different_features{$feature_name} = $entropy;  
 
710 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
711 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
712 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
713 
 
46
 
 
 
 
 
 
 
 
 
189
 
     my $min_entropy_for_best_feature;  
 
714 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $best_feature_name;  
 
715 
 
46
 
 
 
 
 
 
 
 
 
224
 
     foreach my $feature_nom (keys %entropy_values_for_different_features) {   
 
716 
 
184
 
  
100
   
 
 
 
 
 
 
 
478
 
         if (!defined($best_feature_name)) {  
 
717 
 
46
 
 
 
 
 
 
 
 
 
106
 
             $best_feature_name = $feature_nom;  
 
718 
 
46
 
 
 
 
 
 
 
 
 
157
 
             $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};  
 
719 
 
 
 
 
 
 
 
 
 
 
 
 
 
         } else {  
 
720 
 
138
 
  
100
   
 
 
 
 
 
 
 
503
 
             if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {  
 
721 
 
43
 
 
 
 
 
 
 
 
 
131
 
                 $best_feature_name = $feature_nom;  
 
722 
 
43
 
 
 
 
 
 
 
 
 
118
 
                 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};  
 
723 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
724 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
725 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
726 
 
46
 
 
 
 
 
 
 
 
 
120
 
     my $threshold_for_best_feature;  
 
727 
 
46
 
  
 50
   
 
 
 
 
 
 
 
155
 
     if (exists $partitioning_point_threshold{$best_feature_name}) {  
 
728 
 
46
 
 
 
 
 
 
 
 
 
126
 
         $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};  
 
729 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } else {  
 
730 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $threshold_for_best_feature = undef;  
 
731 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
732 
 
46
 
 
 
 
 
 
 
 
 
116
 
     my $best_feature_entropy = $min_entropy_for_best_feature;  
 
733 
 
46
 
 
 
 
 
 
 
 
 
112
 
     my @val_based_entropies_to_be_returned;  
 
734 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $decision_val_to_be_returned;  
 
735 
 
46
 
  
 50
   
 
  
 33
   
 
 
 
 
 
236
 
     if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&   
 
736 
 
 
 
 
 
 
 
 
 
 
 
 
 
           $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >   
 
737 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                     $self->{_symbolic_to_numeric_cardinality_threshold}) {  
 
738 
 
 
 
 
 
 
 
 
 
 
 
 
 
         @val_based_entropies_to_be_returned =   
 
739 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
740 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } else {  
 
741 
 
46
 
 
 
 
 
 
 
 
 
125
 
         @val_based_entropies_to_be_returned = ();  
 
742 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
743 
 
46
 
  
 50
   
 
 
 
 
 
 
 
152
 
     if (exists $partitioning_point_threshold{$best_feature_name}) {  
 
744 
 
46
 
 
 
 
 
 
 
 
 
110
 
         $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};  
 
745 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } else {  
 
746 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $decision_val_to_be_returned = undef;  
 
747 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
748 
 
 
 
 
 
 
 
 
 
 
 
 
 
     print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .  
 
749 
 
46
 
  
 50
   
 
 
 
 
 
 
 
174
 
         "@val_based_entropies_to_be_returned\n"  if $self->{_debug3};  
 
750 
 
46
 
 
 
 
 
 
 
 
 
1506
 
     return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,   
 
751 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                                                       $decision_val_to_be_returned);  
 
752 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
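
##  The following sub is not called anywhere in this module; it is a minimal
##  illustrative sketch, with made-up numbers, of the weighted-entropy
##  combination used above for a candidate numeric threshold: the partitioning
##  entropy is H(left)*P(left) + H(right)*P(right), and the threshold with the
##  smallest such value wins provided it beats the node's existing entropy.
sub _demo_partitioning_entropy_sketch {
    my ($entropy_left,  $prob_left)  = (0.72, 0.40);    # hypothetical values
    my ($entropy_right, $prob_right) = (0.95, 0.60);    # hypothetical values
    my $partitioning_entropy = $entropy_left * $prob_left + $entropy_right * $prob_right;
    return $partitioning_entropy;                       # 0.858 for these numbers
}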
 
#########################################    Entropy Calculators     #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'}
        if exists $self->{_entropy_cache}->{'priors'};
    my @class_names = @{$self->{_class_names}};
    my $entropy;
    foreach my $class (@class_names) {
        my $prob = $self->prior_probability_for_class($class);
        my $log_prob;
        $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999);
        $log_prob = 0 if $prob < 0.0001;           # since X.log(X)->0 as X->0
        $log_prob = 0 if $prob > 0.999;            # since log(1) = 0
        if (!defined $entropy) {
            $entropy = -1.0 * $prob * $log_prob;
            next;
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    $self->{_entropy_cache}->{'priors'} = $entropy;
    return $entropy;
}
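
##  A quick worked example of the calculation above (this sub is never called
##  by the module; the priors are made up): with two classes whose priors are
##  0.6 and 0.4, the entropy on priors is -0.6*log2(0.6) - 0.4*log2(0.4),
##  which comes out to approximately 0.971 bits.
sub _demo_class_entropy_on_priors_sketch {
    my @priors = (0.6, 0.4);                             # hypothetical class priors
    my $entropy = 0;
    $entropy += -1.0 * $_ * (log($_) / log(2)) for @priors;
    return $entropy;                                     # approximately 0.971
}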
 
sub entropy_scanner_for_a_numeric_feature {
    local $| = 1;
    my $self = shift;
    my $feature = shift;
    my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
    my @entropies_for_less_than_thresholds = ();
    my @entropies_for_greater_than_thresholds = ();
    foreach my $point (@all_sampling_points) {
        print ". ";
        push @entropies_for_less_than_thresholds,
                         $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
        push @entropies_for_greater_than_thresholds,
                      $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
    }
    print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: " .
                                                                "@entropies_for_less_than_thresholds\n";
    print "\nSCANNER: All entropies greater than thresholds for feature $feature are: " .
                                                             "@entropies_for_greater_than_thresholds\n";
}
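
##  Typical usage of the scanner above (an illustrative sketch; 'age' stands
##  in for whatever numeric feature name appears in your training file):
##
##      $dt->entropy_scanner_for_a_numeric_feature('age');
##
##  assuming $dt is an Algorithm::DecisionTree instance for which the training
##  data has already been ingested and the probabilities estimated.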
 
sub class_entropy_for_less_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '<' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence}  if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds =
                                       @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                                    $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy +=  -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}
 
sub class_entropy_for_greater_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '>' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence}  if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds =
                                       @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                                    $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy +=  -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}
 
sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $array_of_features_and_values_or_thresholds = shift;
    my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    return $self->{_entropy_cache}->{$sequence}  if exists $self->{_entropy_cache}->{$sequence};
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
                                              $class_name, \@array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy +=  -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}
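
##  An illustrative note on the caching used by the entropy calculators above
##  (the feature names here are made up): a branch consisting of grade=high
##  followed by age<48.5 is looked up under the single key string
##  "grade=high:age<48.5", so repeated entropy queries along the same branch
##  cost one hash lookup instead of a fresh probability calculation.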
 
#####################################   Probability Calculators   ########################################

sub prior_probability_for_class {
    my $self = shift;
    my $class = shift;
    my $class_name_in_cache = "prior" . '::' . $class;
    return $self->{_probability_cache}->{$class_name_in_cache}
        if exists $self->{_probability_cache}->{$class_name_in_cache};
    my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
    my @values = values %{$self->{_samples_class_label_hash}};
    foreach my $class_name (@{$self->{_class_names}}) {
        my @trues = grep {$_ eq $class_name} @values;
        my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
        my $this_class_name_in_cache = "prior" . '::' . $class_name;
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    return $self->{_probability_cache}->{$class_name_in_cache};
}
 
sub calculate_class_priors {
    my $self = shift;
    return if scalar keys %{$self->{_class_priors_hash}} > 1;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $class_name_in_cache = "prior::$class_name";
        my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
        my @all_values = values %{$self->{_samples_class_label_hash}};
        my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
        my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
        $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
        my $this_class_name_in_cache = "prior::$class_name";
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    if ($self->{_debug1}) {
        foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
            print "$class  =>  $self->{_class_priors_hash}->{$class}\n";
        }
    }
}
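
##  A minimal sketch of the prior calculation above (this sub is never called
##  by the module; the sample names and labels are hypothetical): each class
##  prior is simply the fraction of the training samples carrying that label.
sub _demo_class_priors_sketch {
    my %samples_class_label = (sample_1 => 'benign',    sample_2 => 'benign',
                               sample_3 => 'malignant', sample_4 => 'benign');
    my $total = scalar keys %samples_class_label;
    my %priors;
    foreach my $label (values %samples_class_label) {
        $priors{$label} += 1.0 / $total;
    }
    return \%priors;                     # {benign => 0.75, malignant => 0.25}
}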
 
sub calculate_first_order_probabilities {
    print "\nEstimating probabilities...\n";
    my $self = shift;
    foreach my $feature (@{$self->{_feature_names}}) {
        $self->probability_of_feature_value($feature, undef);
        if ($self->{_debug2}) {
            if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
                print "\nPresenting probability distribution for a numeric feature:\n";
                foreach my $sampling_point (sort {$a <=> $b} keys
                                   %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
                    my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
                    print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
                          $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
                }
            } else {
                print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
                my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
                foreach my $value (sort @values_for_feature) {
                    my $prob = $self->probability_of_feature_value($feature,$value);
                    print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
                }
            }
        }
    }
}
 
sub probability_of_feature_value {
    my $self = shift;
    my $feature_name = shift;
    my $value = shift;
    $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
    if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $value = closest_sampling_point($value,
                                        $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_and_value;
    if (defined($value)) {
        $feature_and_value = "$feature_name=$value";
    }
    if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
        return $self->{_probability_cache}->{$feature_and_value};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
                @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
                $diffrange = $valuerange[1] - $valuerange[0];
                my %seen = ();
                my @unique_values_for_feature =  sort {$a <=> $b}  grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                                         @{$self->{_features_and_values_hash}->{$feature_name}};
                my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
                                    $unique_values_for_feature[$_-1]}  1..@unique_values_for_feature-1;
                my $median_diff = $diffs[int(@diffs/2) - 1];
                $histogram_delta =  $median_diff * 2;
                if ($histogram_delta < $diffrange / 500.0) {
                    if (defined $self->{_number_of_histogram_bins}) {
                        $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
                    } else {
                        $histogram_delta = $diffrange / 500.0;
                    }
                }
                $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
                $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
                $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
                my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
                                                                    0..$num_of_histogram_bins-1;
                $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
                                                                           \@sampling_points_for_feature;
            }
        }
    }
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                   $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @sampling_points_for_feature =
                               @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_values_for_feature = grep {$_ ne 'NA'}
                                               @{$self->{_features_and_values_hash}->{$feature_name}};
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_values_for_feature-1) {
                    if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my %bin_prob_hash = ();
            foreach my $i (0..@sampling_points_for_feature-1) {
                $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
            }
            $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
            my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
                                                                             @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        } else {
            my %seen = ();
            my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                                                 @{$self->{_features_and_values_hash}->{$feature_name}};
            @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
            my @value_counts = (0) x @values_for_feature;
#            foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
            foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
                my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
                foreach my $i (0..@values_for_feature-1) {
                    foreach my $current_value (@features_and_values) {
                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFV Something is wrong with your training file. It contains no training samples \
                         for feature named $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        }
    } else {
        # This section is only for purely symbolic features:
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
#        foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                for my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        foreach my $i (0..@values_for_feature-1) {
            $self->{_probability_cache}->{$values_for_feature[$i]} =
                $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
        }
        if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
            return $self->{_probability_cache}->{$feature_and_value};
        } else {
            return 0;
        }
    }
}
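
##  The bin width chosen above for a numeric feature is twice the median gap
##  between consecutive unique values, floored at 1/500th of the value range.
##  Here is a tiny self-contained sketch of that choice (this sub is never
##  called by the module; the values are made up):
sub _demo_histogram_delta_sketch {
    my @unique_values = sort {$a <=> $b} (2.0, 2.5, 3.5, 6.0, 9.0);     # hypothetical
    my @diffs = sort {$a <=> $b} map {$unique_values[$_] - $unique_values[$_-1]}
                                                               1..$#unique_values;
    my $median_diff = $diffs[int(@diffs/2) - 1];          # 1.0 for these numbers
    my $histogram_delta = $median_diff * 2;               # bin width of 2.0
    return $histogram_delta;
}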
 
sub probability_of_feature_value_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $feature_value = shift;
    my $class_name = shift;
    $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
    if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $feature_value = closest_sampling_point($feature_value,
                                $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_value_class;
    if (defined($feature_value)) {
        $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
    }
    if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
        print "\nNext answer returned by cache for feature $feature_name and " .
              "value $feature_value given class $class_name\n" if $self->{_debug2};
        return $self->{_probability_cache}->{$feature_value_class};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
            $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
            @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
            $diffrange = $valuerange[1] - $valuerange[0];
        }
    }
    my @samples_for_class = ();
    # Accumulate all sample names for the given class:
    foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
        if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
            push @samples_for_class, $sample_name;
        }
    }
    if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
                                $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @sampling_points_for_feature =
                              @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_feature_values_for_samples_in_class = ();
            foreach my $sample (@samples_for_class) {
                foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
                    my $pattern = '(.+)=(.+)';
                    $feature_and_value =~ /$pattern/;
                    my ($feature, $value) = ($1, $2);
                    if (($feature eq $feature_name) && ($value ne 'NA')) {
                        push @actual_feature_values_for_samples_in_class, $value;
                    }
                }
            }
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
                    if (abs($sampling_points_for_feature[$i] -
                            $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            die "PFVC1 Something is wrong with your training file. It contains no training " .
                    "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
                                                                     @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature_and_class-1) {
                $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
            }
            if (exists $self->{_probability_cache}->{$feature_value_class}) {
                return $self->{_probability_cache}->{$feature_value_class};
            } else {
                return 0;
            }
        } else {
            # This section is for numeric features that will be treated symbolically
            my %seen = ();
            my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                                                 @{$self->{_features_and_values_hash}->{$feature_name}};
            @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
            my @value_counts = (0) x @values_for_feature;
            foreach my $sample (@samples_for_class) {
                my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
                foreach my $i (0..@values_for_feature-1) {
                    foreach my $current_value (@features_and_values) {
                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFVC2 Something is wrong with your training file. It contains no training " .
                "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
            # We normalize by total_count because the probabilities are conditioned on a given class
            foreach my $i (0..@values_for_feature-1) {
                my $feature_and_value_and_class =  "$values_for_feature[$i]" . "::" . "$class_name";
                $self->{_probability_cache}->{$feature_and_value_and_class} =
                                                           $value_counts[$i] / (1.0 * $total_counts);
            }
            if (exists $self->{_probability_cache}->{$feature_value_class}) {
                return $self->{_probability_cache}->{$feature_value_class};
            } else {
                return 0;
            }
        }
    } else {
        # This section is for purely symbolic features
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
        my %seen = ();
        @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                                             @{$self->{_features_and_values_hash}->{$feature_name}};
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
        foreach my $sample (@samples_for_class) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                foreach my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        my $total_counts = 0;
        map {$total_counts += $_} @value_counts;
        die "PFVC2 Something is wrong with your training file. It contains no training " .
            "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
        # We normalize by total_count because the probabilities are conditioned on a given class
        foreach my $i (0..@values_for_feature-1) {
            my $feature_and_value_and_class =  "$values_for_feature[$i]" . "::" . "$class_name";
            $self->{_probability_cache}->{$feature_and_value_and_class} =
                                                       $value_counts[$i] / (1.0 * $total_counts);
        }
        if (exists $self->{_probability_cache}->{$feature_value_class}) {
            return $self->{_probability_cache}->{$feature_value_class};
        } else {
            return 0;
        }
    }
}
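
##  For a purely symbolic feature, the conditional probability computed above is a
##  normalized count over the training samples carrying the given class label.  A
##  minimal illustration of the calculation (the feature, value, and class names are
##  hypothetical):
##
##      # P(exercising=never | class=benign)
##      #    = count of 'exercising=never' among 'class=benign' samples
##      #      divided by the total feature-value count for that class
##      my $prob = $dt->probability_of_feature_value_given_class('exercising', 'never', 'class=benign');
##
##  For a numeric feature with many unique values, the same quantity is instead read
##  off a histogram accumulated at the feature's sampling points, a value being
##  counted toward a bin when it lies within $self->{_histogram_delta_hash}->{$feature_name}
##  of the bin's sampling point.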
 
sub probability_of_feature_less_than_threshold {
    my $self = shift;
    my $feature_name = shift;
    my $threshold = shift;
    my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
    return $self->{_probability_cache}->{$feature_threshold_combo}
                     if (exists $self->{_probability_cache}->{$feature_threshold_combo});
    my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
    my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
    my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
    $self->{_probability_cache}->{$feature_threshold_combo} = $probability;
    return $probability;
}
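
##  The method above is an empirical CDF estimate: the fraction of all non-'NA'
##  training values of the feature that do not exceed the threshold.  Sketch (the
##  feature name and threshold are hypothetical):
##
##      # P(g2 < 3.0)  =  (number of g2 values <= 3.0) / (number of all g2 values)
##      my $prob = $dt->probability_of_feature_less_than_threshold('g2', 3.0);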
 
sub probability_of_feature_less_than_threshold_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $threshold = shift;
    my $class_name = shift;
    my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
    return $self->{_probability_cache}->{$feature_threshold_class_combo}
                     if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
    my @data_samples_for_class = ();
    # Accumulate all sample names for the given class:
    foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
        push @data_samples_for_class, $sample_name
                  if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
    }
    my @actual_feature_values_for_samples_in_class = ();
    foreach my $sample (@data_samples_for_class) {
        foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
            my $pattern = '(.+)=(.+)';
            $feature_and_value =~ /$pattern/;
            my ($feature,$value) = ($1,$2);
            push @actual_feature_values_for_samples_in_class, $value
                                    if $feature eq $feature_name && $value ne 'NA';
        }
    }
    my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
    # The condition in the assignment that follows was a bug correction in Version 3.20
    my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
    $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
    return $probability;
}
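
##  Same empirical-CDF idea as above, except that the counting is restricted to the
##  training samples carrying the given class label; when a class has no recorded
##  values for the feature, the method returns 0.0 (the Version 3.20 correction noted
##  in the code).  Sketch with hypothetical arguments:
##
##      my $prob = $dt->probability_of_feature_less_than_threshold_given_class('g2', 3.0, 'class=1');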
 
# This method requires that all truly numeric types only be expressed as '<' or '>'
# constructs in the array of branch features and thresholds
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $arr = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    return if scalar @array_of_features_and_values_or_thresholds == 0;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
    my $probability = undef;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } else {
            push @symbolic_types, $item;
            $item =~ /$pattern1/;
            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POS: Answer returned by find_bounded: ",
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
                          $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
            return 0;
        } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability = $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $upperbound{$feature_name}) -
                   $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
            } else {
                $probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
                                                                                   $upperbound{$feature_name}) -
                 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
            }
        } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
            if (! $probability) {
                $probability = $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $upperbound{$feature_name});
            } else {
                $probability *= $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                  $upperbound{$feature_name});
            }
        } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                 $lowerbound{$feature_name});
            } else {
                $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
                                                                                $lowerbound{$feature_name}));
            }
        } else {
            die("Ill formatted call to 'probability_of_sequence' method");
        }
    }
    foreach my $feature_and_value (@symbolic_types) {
        if ($feature_and_value =~ /$pattern1/) {
            my ($feature,$value) = ($1,$2);
            if (! $probability) {
                $probability = $self->probability_of_feature_value($feature, $value);
            } else {
                $probability *= $self->probability_of_feature_value($feature, $value);
            }
        }
    }
    $self->{_probability_cache}->{$sequence} = $probability;
    return $probability;
}
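
##  The joint probability assembled above treats the branch conditions as independent
##  factors: a bounded numeric interval contributes P(feature < upper) - P(feature < lower),
##  a one-sided condition contributes the corresponding tail probability, and each
##  symbolic 'feature=value' test contributes P(feature=value).  Sketch with
##  hypothetical branch conditions:
##
##      # P(g2>1 : g2<3 : grade=2)  ~  [P(g2<3) - P(g2<1)] * P(grade=2)
##      my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
##                                                        ['g2>1', 'g2<3', 'grade=2']);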
 
##  The following method requires that all truly numeric types only be expressed as
##  '<' or '>' constructs in the array of branch features and thresholds
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
    my $self = shift;
    my $arr = shift;
    my $class_name = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    return if scalar @array_of_features_and_values_or_thresholds == 0;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    my $sequence_with_class = "$sequence" . "::" . $class_name;
    return $self->{_probability_cache}->{$sequence_with_class}
                      if exists $self->{_probability_cache}->{$sequence_with_class};
    my $probability = undef;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my @true_numeric_types = ();
    my @true_numeric_types_feature_names = ();
    my @symbolic_types = ();
    my @symbolic_types_feature_names = ();
    foreach my $item (@array_of_features_and_values_or_thresholds) {
        if ($item =~ /$pattern2/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } elsif ($item =~ /$pattern3/) {
            push @true_numeric_types, $item;
            my ($feature,$value) = ($1,$2);
            push @true_numeric_types_feature_names, $feature;
        } else {
            push @symbolic_types, $item;
            $item =~ /$pattern1/;
            my ($feature,$value) = ($1,$2);
            push @symbolic_types_feature_names, $feature;
        }
    }
    my %seen1 = ();
    @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
    my %seen2 = ();
    @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
    my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
    print_array_with_msg("POSC: Answer returned by find_bounded: ",
                                       $bounded_intervals_numeric_types) if $self->{_debug2};
    # Calculate the upper and the lower bounds to be used when searching for the best
    # threshold for each of the numeric features that are in play at the current node:
    my (%upperbound, %lowerbound);
    foreach my $feature_name (@true_numeric_types_feature_names) {
        $upperbound{$feature_name} = undef;
        $lowerbound{$feature_name} = undef;
    }
    foreach my $item (@$bounded_intervals_numeric_types) {
        foreach my $feature_grouping (@$item) {
            if ($feature_grouping->[1] eq '>') {
                $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
            } else {
                $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
            }
        }
    }
    foreach my $feature_name (@true_numeric_types_feature_names) {
        if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
                          $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
            return 0;
        } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability =   $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $upperbound{$feature_name}, $class_name) -
                                 $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $lowerbound{$feature_name}, $class_name);
            } else {
                $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $upperbound{$feature_name}, $class_name) -
                                 $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $lowerbound{$feature_name}, $class_name))
            }
        } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
            if (! $probability) {
                $probability =   $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $upperbound{$feature_name}, $class_name);
            } else {
                $probability *=  $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $upperbound{$feature_name}, $class_name);
            }
        } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
            if (! $probability) {
                $probability =   1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $lowerbound{$feature_name}, $class_name);
            } else {
                $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
                                                              $lowerbound{$feature_name}, $class_name));
            }
        } else {
            die("Ill formatted call to 'probability of sequence given class' method");
        }
    }
    foreach my $feature_and_value (@symbolic_types) {
        if ($feature_and_value =~ /$pattern1/) {
            my ($feature,$value) = ($1,$2);
            if (! $probability) {
                $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
            } else {
                $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
            }
        }
    }
    $self->{_probability_cache}->{$sequence_with_class} = $probability;
    return $probability;
}
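
##  This is the class-conditioned counterpart of the previous method: the same
##  factorization, with every factor replaced by its probability-given-class version
##  and the cache key suffixed with '::classname'.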
 
sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $class_name = shift;
    my $arr = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $sequence = join ':', @array_of_features_and_values_or_thresholds;
    my $class_and_sequence = "$class_name" . "::" . $sequence;
    return $self->{_probability_cache}->{$class_and_sequence}
                      if exists $self->{_probability_cache}->{$class_and_sequence};
    my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
    foreach my $i (0..@{$self->{_class_names}}-1) {
        my $class_name = $self->{_class_names}->[$i];
        my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
                                               \@array_of_features_and_values_or_thresholds, $class_name);
        if ($prob < 0.000001) {
            $array_of_class_probabilities[$i] = 0.0;
            next;
        }
        my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
                                                            \@array_of_features_and_values_or_thresholds);
#        die "PCS Something is wrong with your sequence of feature values and thresholds in " .
#                "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
#                if ! $prob_of_feature_sequence;
        my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
        if ($prob_of_feature_sequence) {
            $array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
        } else {
            $array_of_class_probabilities[$i] = $prior;
        }
    }
    my $sum_probability;
    map {$sum_probability += $_} @array_of_class_probabilities;
    if ($sum_probability == 0) {
        @array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
                                                               (0..@{$self->{_class_names}}-1);
    } else {
        @array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
    }
    foreach my $i (0..@{$self->{_class_names}}-1) {
        my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
        $self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
    }
    return $self->{_probability_cache}->{$class_and_sequence};
}
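
##  The posterior computed above is Bayes' rule followed by a normalization over the
##  classes:
##
##      # P(class | sequence) = P(sequence | class) * P(class) / P(sequence)
##      # with the results rescaled so that they sum to 1 over all the classes
##
##  Any class whose likelihood P(sequence | class) falls below 0.000001 is assigned a
##  zero posterior before the normalization step.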
 
#######################################  Class Based Utilities  ##########################################

##  Given a list of branch attributes for the numeric features of the form, say,
##  ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
##  smallest list that is relevant for the purpose of calculating the probabilities.
##  To explain, the probability that the feature `g2' is less than 1 AND, at the same
##  time, less than 2, AND, at the same time, less than 3, is the same as the
##  probability that the feature is less than 1. Similarly, the probability that 'age'
##  is greater than 34 and also greater than 37 is the same as `age' being greater
##  than 37.
sub find_bounded_intervals_for_numeric_features {
    my $self = shift;
    my $arr = shift;
    my @arr = @$arr;
    my @features = @{$self->{_feature_names}};
    my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
    print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
    my @arr3 = ();
    foreach my $feature_name (@features) {
        my @temp = ();
        foreach my $x (@arr1) {
            push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
        }
        push @arr3, \@temp if @temp > 0;
    }
    print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
    # Sort each list so that '<' entries occur before '>' entries:
    my @arr4;
    foreach my $li (@arr3) {
        my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
        push @arr4, \@sorted;
    }
    print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
    my @arr5;
    foreach my $li (@arr4) {
        my @temp1 = ();
        my @temp2 = ();
        foreach my $inner (@$li) {
            if ($inner->[1] eq '<') {
                push @temp1, $inner;
            } else {
                push @temp2, $inner;
            }
        }
        if (@temp1 > 0 && @temp2 > 0) {
            push @arr5, [\@temp1, \@temp2];
        } elsif (@temp1 > 0) {
            push @arr5, [\@temp1];
        } else {
            push @arr5, [\@temp2];
        }
    }
    print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
    my @arr6 = ();
    foreach my $li (@arr5) {
        my @temp1 = ();
        foreach my $inner (@$li) {
            my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
            push @temp1, \@sorted;
        }
        push @arr6, \@temp1;
    }
    print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
    my @arr9 = ();
    foreach my $li (@arr6) {
        foreach my $alist (@$li) {
            my @newalist = ();
            if ($alist->[0][1] eq '<') {
                push @newalist, $alist->[0];
            } else {
                push @newalist, $alist->[-1];
            }
            if ($alist->[0][1] ne $alist->[-1][1]) {
                push @newalist, $alist->[-1];
            }
            push @arr9, \@newalist;
        }
    }
    print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
    return \@arr9;
}
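
##  To make the reduction concrete: for the example list shown above, each surviving
##  bound comes back as a ['feature', operator, threshold] triple, so the returned
##  structure would take the shape (a sketch of the expected form, not verified
##  output):
##
##      my $bounds = $dt->find_bounded_intervals_for_numeric_features(
##                          ['g2<1','g2<2','g2<3','age>34','age>36','age>37']);
##      # $bounds ~ [ [['g2','<',1]], [['age','>',37]] ]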
 
##  This method is used to verify that you used legal feature names in the test
##  sample that you want to classify with the decision tree.
sub check_names_used {
    my $self = shift;
    my $features_and_values_test_data = shift;
    my @features_and_values_test_data = @$features_and_values_test_data;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values_test_data) {
        $feature_and_value =~ /$pattern/;
        my ($feature,$value) = ($1,$2);
        die "Your test data has a formatting error" unless defined($feature) && defined($value);
        return 0 unless contained_in($feature, @{$self->{_feature_names}});
    }
    return 1;
}
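
##  A typical guard built on this method (the test sample uses the same
##  'feature=value' string format as the training data; the values shown are made up):
##
##      my @test_sample = ('g2=4.2', 'grade=2.0');
##      die "test sample uses an unknown feature name" unless $dt->check_names_used(\@test_sample);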
 
#######################################  Data Condition Calculator  ######################################

##  This method estimates the worst-case fan-out of the decision tree taking into
##  account the number of values (and therefore the number of branches emanating from
##  a node) for the symbolic features.
sub determine_data_condition {
    my $self = shift;
    my $num_of_features = scalar @{$self->{_feature_names}};
    my @values = ();
    my @number_of_values;
    foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
        push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
            if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
        push @number_of_values, scalar @values;
    }
    return if ! @values;
    print "Number of features: $num_of_features\n";
    my @minmax = minmax(\@number_of_values);
    my $max_num_values = $minmax[1];
    print "Largest number of values for symbolic features is: $max_num_values\n";
    my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
    print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
          "nodes. The exact number of nodes created depends critically on " .
          "the entropy_threshold used for node expansion (the default value " .
          "for this threshold is 0.01) and on the value set for max_depth_desired " .
          "for the depth of the tree.\n";
    if ($estimated_number_of_nodes > 10000) {
        print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
              "large value for entropy_threshold and/or a small value for " .
              "max_depth_desired to reduce the number of nodes created.\n";
        print "\nDo you wish to continue anyway? Enter 'y' for yes:  ";
        my $answer = <STDIN>;
        $answer =~ s/\r?\n?$//;
        while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
            print "\nAnswer not recognized.  Let's try again. Enter 'y' or 'n': ";
            $answer = <STDIN>;
            $answer =~ s/\r?\n?$//;
        }
        die unless $answer =~ /y(es)?/i;
    }
}
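
##  The worst-case estimate printed above is just the symbolic branching factor raised
##  to the number of features:
##
##      my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
##      # e.g., 4 features with at most 8 values each allows 8 ** 4 = 4096 nodes
##
##  which is why entropy_threshold and max_depth_desired matter as the feature set grows.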
 
1685 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1686 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1687 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ####################################  Read Training Data From File  ######################################  
 
1688 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1689 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1690 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_training_data {  
 
1691 
 
1
 
 
 
 
 
  
1
   
 
  
1
   
 
7
 
     my $self = shift;  
 
1692 
 
1
 
  
 50
   
 
 
 
 
 
 
 
10
 
     die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;  
 
1693 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my %class_names = ();  
 
1694 
 
1
 
 
 
 
 
 
 
 
 
3
 
     my %all_record_ids_with_class_labels;  
 
1695 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $firstline;  
 
1696 
 
1
 
 
 
 
 
 
 
 
 
0
 
     my %data_hash;  
 
1697 
 
1
 
 
 
 
 
 
 
 
 
4
 
     $|++;  
 
1698 
 
1
 
 
 
  
 50
   
 
 
 
 
 
25
 
     open FILEIN, $self->{_training_datafile} || die "unable to open $self->{_training_datafile}: $!";  
 
1699 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my $record_index = 0;  
 
1700 
 
1
 
 
 
 
 
 
 
 
 
2
 
     my $firsetline;  
 
1701 
 
1
 
 
 
 
 
 
 
 
 
16
 
     while () {   
 
1702 
 
36
 
  
 50
   
 
 
 
 
 
 
 
115
 
         next if /^[ ]*\r?\n?$/;  
 
1703 
 
36
 
 
 
 
 
 
 
 
 
112
 
         $_ =~ s/\r?\n?$//;  
 
1704 
 
36
 
  
 50
   
 
 
 
 
 
 
 
84
 
         my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;  
 
1705 
 
36
 
  
100
   
 
 
 
 
 
 
 
77
 
         if ($record_index == 0) {  
 
1706 
 
1
 
 
 
 
 
 
 
 
 
2
 
             $firstline = $record;  
 
1707 
 
1
 
 
 
 
 
 
 
 
 
3
 
             $record_index++;  
 
1708 
 
1
 
 
 
 
 
 
 
 
 
11
 
             next;  
 
1709 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1710 
 
35
 
 
 
 
 
 
 
 
 
152
 
         my @parts = split /,/, $record;  
 
1711 
 
35
 
 
 
 
 
 
 
 
 
66
 
         my $classname = $parts[$self->{_csv_class_column_index}];  
 
1712 
 
35
 
 
 
 
 
 
 
 
 
49
 
         $class_names{$classname} = 1;  
 
1713 
 
35
 
 
 
 
 
 
 
 
 
56
 
         my $record_label = shift @parts;  
 
1714 
 
35
 
 
 
 
 
 
 
 
 
81
 
         $record_label  =~ s/^\s*\"|\"\s*$//g;  
 
1715 
 
35
 
 
 
 
 
 
 
 
 
77
 
         $data_hash{$record_label} = \@parts;  
 
1716 
 
35
 
 
 
 
 
 
 
 
 
59
 
         $all_record_ids_with_class_labels{$record_label} = $classname;  
 
1717 
 
35
 
  
 50
   
 
 
 
 
 
 
 
76
 
         print "." if $record_index % 10000 == 0;  
 
1718 
 
35
 
 
 
 
 
 
 
 
 
108
 
         $record_index++;  
 
1719 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1720 
 
1
 
 
 
 
 
 
 
 
 
9
 
     close FILEIN;      
 
1721 
 
1
 
 
 
 
 
 
 
 
 
3
 
     $|--;  
 
1722 
 
1
 
 
 
 
 
 
 
 
 
5
 
     $self->{_how_many_total_training_samples} = $record_index - 1;  # must subtract 1 for the header record  
 
1723 
 
1
 
  
 50
   
 
 
 
 
 
 
 
4
 
     print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};  
 
1724 
 
1
 
 
 
 
 
 
 
 
 
11
 
     my @all_feature_names =   split /,/, substr($firstline, index($firstline,','));  
 
1725 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];  
 
1726 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;  
 
  
 
2
 
 
 
 
 
 
 
 
 
12
 
    
 
1727 
 
1
 
 
 
 
 
 
 
 
 
3
 
     my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};  
 
  
 
4
 
 
 
 
 
 
 
 
 
9
 
    
 
  
 
1
 
 
 
 
 
 
 
 
 
3
 
    
 
1728 
 
1
 
 
 
 
 
 
 
 
 
5
 
     # Build "sample_<id>"  =>  "<class-column-heading>=<class-label>" pairs; the class
     # column index is offset by 1 because the record label was shifted off @parts above:
     my %class_for_sample_hash = map {"sample_" . $_  =>  "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
 
  
 
35
 
 
 
 
 
 
 
 
 
96
 
    
 
1729 
 
1
 
 
 
 
 
 
 
 
 
10
 
     my @sample_names = map {"sample_$_"} keys %data_hash;  
 
  
 
35
 
 
 
 
 
 
 
 
 
59
 
    
 
1730 
 
1
 
  
 50
   
 
 
 
 
 
 
 
8
 
     # For each sample, build an anonymous array of "feature=value" strings; values
     # consisting solely of digits are rewritten as one-decimal floats (e.g. "2"
     # becomes "2.0"):
     my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID  =>  [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] }  keys %data_hash;
 
  
 
35
 
 
 
 
 
 
 
 
 
59
 
    
 
  
 
35
 
 
 
 
 
 
 
 
 
55
 
    
 
  
 
140
 
 
 
 
 
 
 
 
 
220
 
    
 
  
 
140
 
 
 
 
 
 
 
 
 
223
 
    
 
  
 
140
 
 
 
 
 
 
 
 
 
516
 
    
 
  
 
35
 
 
 
 
 
 
 
 
 
58
 
    
 
1731 
 
1
 
  
 50
   
 
 
 
 
 
 
 
10
 
     # For each feature column, collect the values that feature takes across all
     # samples, with the same one-decimal normalization of integer-looking values:
     my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [  map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
 
  
 
4
 
 
 
 
 
 
 
 
 
15
 
    
 
  
 
4
 
 
 
 
 
 
 
 
 
11
 
    
 
  
 
4
 
 
 
 
 
 
 
 
 
31
 
    
 
  
 
140
 
 
 
 
 
 
 
 
 
300
 
    
 
  
 
140
 
 
 
 
 
 
 
 
 
518
 
    
 
  
 
140
 
 
 
 
 
 
 
 
 
387
 
    
 
  
 
1
 
 
 
 
 
 
 
 
 
5
 
    
 
1732 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my %numeric_features_valuerange_hash = ();  
 
1733 
 
1
 
 
 
 
 
 
 
 
 
3
 
     my %feature_values_how_many_uniques_hash = ();  
 
1734 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my %features_and_unique_values_hash = ();  
 
1735 
 
1
 
 
 
 
 
 
 
 
 
3
 
     my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';  
 
1736 
 
1
 
 
 
 
 
 
 
 
 
5
 
     foreach my $feature (keys %features_and_values_hash) {  
 
1737 
 
4
 
 
 
 
 
 
 
 
 
9
 
         my %seen1 = ();  
 
1738 
 
140
 
  
100
   
 
  
 66
   
 
 
 
 
 
655
 
         my @unique_values_for_feature = sort grep {$_ ne 'NA' && !$seen1{$_}++}
 
1739 
 
4
 
 
 
 
 
 
 
 
 
8
 
                                                    @{$features_and_values_hash{$feature}};  
 
  
 
4
 
 
 
 
 
 
 
 
 
16
 
    
 
1740 
 
4
 
 
 
 
 
 
 
 
 
15
 
         $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;  
 
1741 
 
4
 
 
 
 
 
 
 
 
 
11
 
         my $not_all_values_float = 0;  
 
1742 
 
4
 
  
 50
   
 
 
 
 
 
 
 
11
 
         map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;  
 
  
 
14
 
 
 
 
 
 
 
 
 
152
 
    
 
1743 
 
4
 
  
 50
   
 
 
 
 
 
 
 
13
 
         if ($not_all_values_float == 0) {  
 
1744 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @minmaxvalues = minmax(\@unique_values_for_feature);  
 
1745 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;   
 
1746 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1747 
 
4
 
 
 
 
 
 
 
 
 
17
 
         $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;  
 
1748 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1749 
 
1
 
  
 50
   
 
 
 
 
 
 
 
7
 
     if ($self->{_debug1}) {  
 
1750 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nAll class names: @all_class_names\n";  
 
1751 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nEach sample data record:\n";  
 
1752 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1753 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "$sample  =>  @{$feature_values_for_samples_hash{$sample}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1754 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1755 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nclass label for each data sample:\n";  
 
1756 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $sample (sort {sample_index($a) <=> sample_index($b)}  keys %class_for_sample_hash) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1757 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "$sample => $class_for_sample_hash{$sample}\n";  
 
1758 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1759 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nFeatures used: @feature_names\n\n";  
 
1760 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nfeatures and the values taken by them:\n";  
 
1761 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $feature (sort keys %features_and_values_hash) {  
 
1762 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "$feature => @{$features_and_values_hash{$feature}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1763 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1764 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nnumeric features and their ranges:\n";  
 
1765 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach  my $feature (sort keys %numeric_features_valuerange_hash) {  
 
1766 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "$feature  =>  @{$numeric_features_valuerange_hash{$feature}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1767 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1768 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nnumber of unique values in each feature:\n";  
 
1769 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach  my $feature (sort keys %feature_values_how_many_uniques_hash) {  
 
1770 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "$feature  =>  $feature_values_how_many_uniques_hash{$feature}\n";  
 
1771 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1772 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1773 
 
1
 
 
 
 
 
 
 
 
 
4
 
     $self->{_class_names} = \@all_class_names;  
 
1774 
 
1
 
 
 
 
 
 
 
 
 
4
 
     $self->{_feature_names} = \@feature_names;  
 
1775 
 
1
 
 
 
 
 
 
 
 
 
4
 
     $self->{_samples_class_label_hash}  =  \%class_for_sample_hash;  
 
1776 
 
1
 
 
 
 
 
 
 
 
 
3
 
     $self->{_training_data_hash}  =  \%feature_values_for_samples_hash;  
 
1777 
 
1
 
 
 
 
 
 
 
 
 
4
 
     $self->{_features_and_values_hash}  = \%features_and_values_hash;  
 
1778 
 
1
 
 
 
 
 
 
 
 
 
12
 
     $self->{_features_and_unique_values_hash}  =  \%features_and_unique_values_hash;  
 
1779 
 
1
 
 
 
 
 
 
 
 
 
3
 
     $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;  
 
1780 
 
1
 
 
 
 
 
 
 
 
 
44
 
     $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;  
 
1781 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
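 ##  A minimal usage sketch for the routine above (the file name and column
 ##  choices are illustrative assumptions, not part of the module):
 ##
 ##      my $dt = Algorithm::DecisionTree->new(
 ##                       training_datafile        => 'training.csv',
 ##                       csv_class_column_index   => 1,
 ##                       csv_columns_for_features => [2,3,4,5],
 ##                       entropy_threshold        => 0.01,
 ##                       max_depth_desired        => 3,
 ##               );
 ##      $dt->get_training_data();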
 
1782 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1783 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub show_training_data {  
 
1784 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
1
   
 
0
 
     my $self = shift;  
 
1785 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @class_names = @{$self->{_class_names}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1786 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my %features_and_values_hash = %{$self->{_features_and_values_hash}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1787 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1788 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my %training_data_hash = %{$self->{_training_data_hash}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1789 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\nClass Names: @class_names\n";  
 
1790 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\nFeatures and Their Values:\n\n";  
 
1791 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     while ( my ($k, $v) = each %features_and_values_hash ) {  
 
1792 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "$k --->  @{$features_and_values_hash{$k}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1793 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1794 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\nSamples vs. Class Labels:\n\n";  
 
1795 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1796 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "$kee =>  $samples_class_label_hash{$kee}\n";  
 
1797 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1798 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\nTraining Samples:\n\n";  
 
1799 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $kee (sort {sample_index($a) <=> sample_index($b)}   
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1800 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                       keys %training_data_hash) {  
 
1801 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "$kee =>  @{$training_data_hash{$kee}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1802 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1803 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }      
 
1804 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1805 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_class_names {  
 
1806 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $self = shift;  
 
1807 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return @{$self->{_class_names}};
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1808 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1809 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1810 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##########################################  Utility Routines  ############################################  
 
1811 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1812 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub closest_sampling_point {  
 
1813 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $value = shift;  
 
1814 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $arr_ref = shift;  
 
1815 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @arr = @{$arr_ref};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1816 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @compare = map {abs($_ - $value)} @arr;  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1817 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my ($minval,$index) = minimum(\@compare);  
 
1818 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $arr[$index];  
 
1819 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1820 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1821 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ## Returns the array index at which a specified STRING value occurs (meant only for arrays of strings):
 
1822 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_index_at_value {  
 
1823 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $value = shift;  
 
1824 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @array = @{shift @_};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1825 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $i (0..@array-1) {  
 
1826 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
         return $i if $value eq $array[$i];  
 
1827 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1828 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1829 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1830 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  When the training data is read from a CSV file, we assume that the first column  
 
1831 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  of each data record contains a unique integer identifier for the record in that  
 
1832 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  row. This training data is stored in a hash whose keys are the prefix 'sample_'  
 
1833 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  followed by the identifying integers.  The purpose of this function is to return  
 
1834 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  the identifying integer associated with a data record.  
 
1835 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub sample_index {  
 
1836 
 
1088
 
 
 
 
 
  
1088
   
 
  
0
   
 
1757
 
     my $arg = shift;  
 
1837 
 
1088
 
 
 
 
 
 
 
 
 
2447
 
     $arg =~ /_(.+)$/;  
 
1838 
 
1088
 
 
 
 
 
 
 
 
 
2685
 
     return $1;  
 
1839 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }      
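 ##  To make the record-naming convention concrete (hypothetical sample name):
 ##
 ##      my $idx = sample_index("sample_27");    # returns 27
 ##
 ##  which is what makes the numeric sorts of the "sample_..." keys elsewhere in
 ##  this module possible.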
 
1840 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1841 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # Returns the minimum value and its positional index in an array  
 
1842 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub minimum {  
 
1843 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $arr = shift;  
 
1844 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $min;  
 
1845 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $index;  
 
1846 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $i (0..@{$arr}-1) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1847 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
         if ( (!defined $min) || ($arr->[$i] < $min) ) {  
 
1848 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $index = $i;  
 
1849 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $min = $arr->[$i];  
 
1850 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1851 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1852 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return ($min, $index);  
 
1853 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1854 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1855 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # Returns an array of two values, the min and the max, of an array of floats  
 
1856 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub minmax {  
 
1857 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $arr = shift;  
 
1858 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my ($min, $max);  
 
1859 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $i (0..@{$arr}-1) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1860 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
         if ( (!defined $min) || ($arr->[$i] < $min) ) {  
 
1861 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $min = $arr->[$i];  
 
1862 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1863 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
         if ( (!defined $max) || ($arr->[$i] > $max) ) {  
 
1864 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $max = $arr->[$i];  
 
1865 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1866 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1867 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return ($min, $max);  
 
1868 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
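 ##  For instance (illustrative values only):
 ##
 ##      my ($min, $max) = minmax([3.1, 0.5, 9.2]);    # yields (0.5, 9.2)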
 
1869 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1870 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # checks whether an element is in an array:  
 
1871 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub contained_in {  
 
1872 
 
414
 
 
 
 
 
  
414
   
 
  
0
   
 
858
 
     my $ele = shift;  
 
1873 
 
414
 
 
 
 
 
 
 
 
 
1227
 
     my @array = @_;  
 
1874 
 
414
 
 
 
 
 
 
 
 
 
858
 
     my $count = 0;  
 
1875 
 
414
 
  
100
   
 
 
 
 
 
 
 
908
 
     map {$count++ if $ele eq $_} @array;  
 
  
 
1104
 
 
 
 
 
 
 
 
 
3997
 
    
 
1876 
 
414
 
 
 
 
 
 
 
 
 
1727
 
     return $count;  
 
1877 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1878 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1879 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # Meant only for an array of strings (no nesting):  
 
1880 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub deep_copy_array {  
 
1881 
 
289
 
 
 
 
 
  
289
   
 
  
0
   
 
619
 
     my $ref_in = shift;  
 
1882 
 
289
 
 
 
 
 
 
 
 
 
578
 
     my $ref_out;  
 
1883 
 
289
 
  
100
   
 
 
 
 
 
 
 
885
 
     return [] if scalar @$ref_in == 0;  
 
1884 
 
274
 
 
 
 
 
 
 
 
 
525
 
     foreach my $i (0..@{$ref_in}-1) {  
 
  
 
274
 
 
 
 
 
 
 
 
 
994
 
    
 
1885 
 
628
 
 
 
 
 
 
 
 
 
1783
 
         $ref_out->[$i] = $ref_in->[$i];  
 
1886 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1887 
 
274
 
 
 
 
 
 
 
 
 
986
 
     return $ref_out;  
 
1888 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1889 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1890 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub check_for_illegal_params2 {  
 
1891 
 
1
 
 
 
 
 
  
1
   
 
  
0
   
 
3
 
     my @params = @_;  
 
1892 
 
1
 
 
 
 
 
 
 
 
 
7
 
     my @legal_params = qw / training_datafile  
 
1893 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             entropy_threshold  
 
1894 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             max_depth_desired  
 
1895 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             csv_class_column_index  
 
1896 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             csv_columns_for_features  
 
1897 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             symbolic_to_numeric_cardinality_threshold  
 
1898 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             number_of_histogram_bins  
 
1899 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             csv_cleanup_needed  
 
1900 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             debug1  
 
1901 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             debug2  
 
1902 
 
 
 
 
 
 
 
 
 
 
 
 
 
                             debug3  
 
1903 
 
 
 
 
 
 
 
 
 
 
 
 
 
                           /;  
 
1904 
 
1
 
 
 
 
 
 
 
 
 
7
 
     my $found_match_flag;  
 
1905 
 
1
 
 
 
 
 
 
 
 
 
3
 
     foreach my $param (@params) {  
 
1906 
 
3
 
 
 
 
 
 
 
 
 
5
 
         foreach my $legal (@legal_params) {  
 
1907 
 
10
 
 
 
 
 
 
 
 
 
16
 
             $found_match_flag = 0;  
 
1908 
 
10
 
  
100
   
 
 
 
 
 
 
 
20
 
             if ($param eq $legal) {  
 
1909 
 
3
 
 
 
 
 
 
 
 
 
4
 
                 $found_match_flag = 1;  
 
1910 
 
3
 
 
 
 
 
 
 
 
 
5
 
                 last;  
 
1911 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
1912 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1913 
 
3
 
  
 50
   
 
 
 
 
 
 
 
9
 
         last if $found_match_flag == 0;  
 
1914 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1915 
 
1
 
 
 
 
 
 
 
 
 
9
 
     return $found_match_flag;  
 
1916 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1917 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1918 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub print_array_with_msg {  
 
1919 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $message = shift;  
 
1920 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $arr = shift;  
 
1921 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n$message: ";  
 
1922 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print_nested_array( $arr );  
 
1923 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1924 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1925 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub print_nested_array {  
 
1926 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $arr = shift;  
 
1927 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @arr = @$arr;  
 
1928 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "[";  
 
1929 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $item (@arr) {  
 
1930 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
         if (ref $item) {  
 
1931 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print_nested_array($item);  
 
1932 
 
 
 
 
 
 
 
 
 
 
 
 
 
         } else {  
 
1933 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "$item";  
 
1934 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
1935 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1936 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "]";  
 
1937 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }      
 
1938 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1939 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub cleanup_csv {  
 
1940 
 
  
0
   
 
 
 
 
 
  
0
   
 
  
0
   
 
0
 
     my $line = shift;  
 
1941 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     $line =~ tr/\/:?()[]{}'/          /;  
 
1942 
 
 
 
 
 
 
 
 
 
 
 
 
 
 #    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;  
 
1943 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;  
 
1944 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     for (@double_quoted) {  
 
1945 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $item = $_;  
 
1946 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $item = substr($item, 1, -1);  
 
1947 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $item =~ s/^\s+|,|\s+$//g;  
 
1948 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $item = join '_',  split /\s+/, $item;  
 
1949 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         substr($line, index($line, $_), length($_)) = $item;  
 
1950 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1951 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;  
 
1952 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     for (@white_spaced) {  
 
1953 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $item = $_;  
 
1954 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $item =~ s/\s+/_/g;  
 
1955 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $item =~ s/^\s*_|_\s*$//g;  
 
1956 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         substr($line, index($line, $_), length($_)) = $item;  
 
1957 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
1958 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     $line =~ s/,\s*(?=,|$)/,NA/g;  
 
1959 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $line;  
 
1960 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
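 ##  To make the cleanup steps above concrete, here is what a sketchy input line
 ##  would turn into (illustrative data only): a double-quoted field has its
 ##  internal whitespace replaced by underscores, stray whitespace inside an
 ##  unquoted field is likewise collapsed to underscores, and empty fields
 ##  become 'NA':
 ##
 ##      cleanup_csv('1,"good movie", not bad,');   # yields '1,good_movie,not_bad,NA'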
 
1961 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1962 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ######################################### Class EvalTrainingData  ########################################  
 
1963 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1964 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  This subclass of the DecisionTree class is used to evaluate the quality of your  
 
1965 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  training data by running a 10-fold cross-validation test on it. This test divides  
 
1966 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  all of the training data into ten parts, with nine parts used for training a  
 
1967 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  decision tree and one part used for testing its ability to classify correctly.  
 
1968 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  This selection of nine parts for training and one part for testing is carried out  
 
1969 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  in all of the ten different possible ways.  This testing functionality can also  
 
1970 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  be used to find the best values to use for the constructor parameters  
 
1971 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  entropy_threshold, max_depth_desired, and  
 
1972 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  symbolic_to_numeric_cardinality_threshold.  
 
1973 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1974 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  Only the CSV training files can be evaluated in this manner (because only CSV  
 
1975 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  training files are allowed to have numeric features --- which is the more interesting
 
1976 
 
 
 
 
 
 
 
 
 
 
 
 
 
 ##  case for evaluation analytics.  
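 ##  A minimal sketch of how this class is meant to be exercised (the file name
 ##  and parameter values below are illustrative assumptions):
 ##
 ##      my $eval_data = EvalTrainingData->new(
 ##                              training_datafile        => 'training.csv',
 ##                              csv_class_column_index   => 1,
 ##                              csv_columns_for_features => [2,3],
 ##                              entropy_threshold        => 0.01,
 ##                              max_depth_desired        => 5,
 ##                      );
 ##      $eval_data->get_training_data();
 ##      $eval_data->evaluate_training_data();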
 
1977 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1978 
 
 
 
 
 
 
 
 
 
 
 
 
 
 package EvalTrainingData;  
 
1979 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1980 
 
 
 
 
 
 
 
 
 
 
 
 
 
 @EvalTrainingData::ISA = ('Algorithm::DecisionTree');  
 
1981 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1982 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub new {  
 
1983 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $class = shift;  
 
1984 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $instance = Algorithm::DecisionTree->new(@_);  
 
1985 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     bless $instance, $class;  
 
1986 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
1987 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
1988 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub evaluate_training_data {  
 
1989 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;  
 
1990 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $evaldebug = 0;  
 
1991 
 
 
 
 
 
 
 
 
 
 
 
 
 
     die "The data evaluation function in the module can only be used when your " .  
 
1992 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
         "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;  
 
1993 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\nWill run a 10-fold cross-validation test on your training data to test its " .  
 
1994 
 
 
 
 
 
 
 
 
 
 
 
 
 
           "class-discriminatory power:\n";  
 
1995 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my %all_training_data = %{$self->{_training_data_hash}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1996 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>   
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
1997 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                      Algorithm::DecisionTree::sample_index($b)}  keys %all_training_data;  
 
1998 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $fold_size = int(0.1 * (scalar keys %all_training_data));  
 
1999 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "fold size: $fold_size\n";  
 
2000 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my %confusion_matrix = ();  
 
2001 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $class_name (@{$self->{_class_names}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2002 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $inner_class_name (@{$self->{_class_names}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2003 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $confusion_matrix{$class_name}->{$inner_class_name} = 0;  
 
2004 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2005 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2006 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $fold_index (0..9) {  
 
2007 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";   
 
2008 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];  
 
2009 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],    
 
2010 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);  
 
2011 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my %testing_data = ();  
 
2012 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $x (@testing_samples) {  
 
2013 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $testing_data{$x} = $all_training_data{$x};  
 
2014 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2015 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my %training_data = ();  
 
2016 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $x (@training_samples) {  
 
2017 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $training_data{$x} = $all_training_data{$x};  
 
2018 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2019 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $trainingDT = Algorithm::DecisionTree->new('evalmode');  
 
2020 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->{_training_data_hash} = \%training_data;  
 
2021 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->{_class_names} = $self->{_class_names};  
 
2022 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->{_feature_names} = $self->{_feature_names};  
 
2023 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};  
 
2024 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};  
 
2025 
 
 
 
 
 
 
 
 
 
 
 
 
 
         $trainingDT->{_symbolic_to_numeric_cardinality_threshold} =   
 
2026 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                                                 $self->{_symbolic_to_numeric_cardinality_threshold};  
 
2027 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $sample_name (@training_samples) {  
 
2028 
 
 
 
 
 
 
 
 
 
 
 
 
 
             $trainingDT->{_samples_class_label_hash}->{$sample_name} =   
 
2029 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                                                 $self->{_samples_class_label_hash}->{$sample_name};  
 
2030 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2031 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $feature (keys %{$self->{_features_and_values_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2032 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $trainingDT->{_features_and_values_hash}->{$feature} = ();  
 
2033 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2034 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $pattern = '(\S+)\s*=\s*(\S+)';  
 
2035 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>   
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2036 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                 Algorithm::DecisionTree::sample_index($b)}    
 
2037 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                           keys %{$trainingDT->{_training_data_hash}}) {  
 
2038 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2039 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $feature_and_value =~ /$pattern/;  
 
2040 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 my ($feature,$value) = ($1,$2);  
 
2041 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
                 push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2042 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2043 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2044 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2045 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my %seen = ();  
 
2046 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
             my @unique_values_for_feature = grep {$_ ne 'NA' && !$seen{$_}++}
 
2047 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                                                 @{$trainingDT->{_features_and_values_hash}->{$feature}};   
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2048 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             if (Algorithm::DecisionTree::contained_in($feature,   
 
2049 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                                                 keys %{$self->{_numeric_features_valuerange_hash}})) {  
 
2050 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2051 
 
 
 
 
 
 
 
 
 
 
 
 
 
             } else {  
 
2052 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 @unique_values_for_feature = sort @unique_values_for_feature;  
 
2053 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2054 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;  
 
2055 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2056 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2057 
 
 
 
 
 
 
 
 
 
 
 
 
 
             my @minmaxvalues = Algorithm::DecisionTree::minmax(  
 
2058 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                                          \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2059 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;  
 
2060 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2061 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
         if ($evaldebug) {  
 
2062 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "\n\nprinting samples in the testing set: @testing_samples\n";  
 
2063 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "\n\nPrinting features and their values in the training set:\n";  
 
2064 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2065 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "$item  => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2066 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2067 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "\n\nPrinting unique values for features:\n";  
 
2068 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2069 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "$item  => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2070 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2071 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             print "\n\nPrinting unique value ranges for features:\n";  
 
2072 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2073 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "$item  => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2074 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2075 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2076 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2077 
 
 
 
 
 
 
 
 
 
 
 
 
 
             $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =   
 
2078 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2079 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2080 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
         $trainingDT->{_debug2} = 1 if $evaldebug;  
 
2081 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->calculate_first_order_probabilities();  
 
2082 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $trainingDT->calculate_class_priors();  
 
2083 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $root_node = $trainingDT->construct_decision_tree_classifier();  
 
2084 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
         $root_node->display_decision_tree("     ") if $evaldebug;  
 
2085 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $test_sample_name (@testing_samples) {  
 
2086 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @test_sample_data = @{$all_training_data{$test_sample_name}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2087 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             print "original data in test sample: @test_sample_data\n" if $evaldebug;  
 
2088 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
             @test_sample_data = grep {$_ && $_ !~ /=NA$/} @test_sample_data;
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2089 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             print "filtered data in test sample: @test_sample_data\n" if $evaldebug;  
 
2090 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2091 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @solution_path = @{$classification{'solution_path'}};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2092 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             delete $classification{'solution_path'};  
 
2093 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my @which_classes = keys %classification;  
 
2094 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2095 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my $most_likely_class_label = $which_classes[0];  
 
2096 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             if ($evaldebug) {  
 
2097 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "\nClassification:\n\n";  
 
2098 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "     class                         probability\n";  
 
2099 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "     ----------                    -----------\n";  
 
2100 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 foreach my $which_class (@which_classes) {  
 
2101 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     my $classstring = sprintf("%-30s", $which_class);  
 
2102 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     my $valuestring = sprintf("%-30s", $classification{$which_class});  
 
2103 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                     print "     $classstring $valuestring\n";  
 
2104 
 
 
 
 
 
 
 
 
 
 
 
 
 
                 }  
 
2105 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "\nSolution path in the decision tree: @solution_path\n";  
 
2106 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";  
 
2107 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2108 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};  
 
2109 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             print "$test_sample_name:    true_class: $true_class_label_for_sample    " .  
 
2110 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      "estimated_class: $most_likely_class_label\n"  if $evaldebug;  
 
2111 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;  
 
2112 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2113 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2114 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\n       DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";  
 
2115 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $matrix_header = " " x 30;  
 
2116 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $class_name (@{$self->{_class_names}}) {    
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2117 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         $matrix_header .= sprintf("%-30s", $class_name);  
 
2118 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2119 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n" . $matrix_header . "\n\n";  
 
2120 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $row_class_name (sort keys %confusion_matrix) {  
 
2121 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $row_display = sprintf("%-30s", $row_class_name);  
 
2122 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2123 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
             $row_display .= sprintf( "%-30u",  $confusion_matrix{$row_class_name}->{$col_class_name} );  
 
2124 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2125 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "$row_display\n\n";  
 
2126 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2127 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\n";  
 
2128 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my ($diagonal_sum, $off_diagonal_sum) = (0,0);  
 
2129 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     foreach my $row_class_name (sort keys %confusion_matrix) {  
 
2130 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2131 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
             if ($row_class_name eq $col_class_name) {  
 
2132 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};  
 
2133 
 
 
 
 
 
 
 
 
 
 
 
 
 
             } else {  
 
2134 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
                 $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};  
 
2135 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2136 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2137 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2138 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);  
 
2139 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\nTraining Data Quality Index: $data_quality_index    (out of a possible maximum of 100)\n";  
 
2140 
 
  
0
   
 
  
  0
   
 
  
  0
   
 
 
 
 
 
0
 
     if ($data_quality_index <= 80) {  
 
  
 
 
 
  
  0
   
 
  
  0
   
 
 
 
 
 
 
 
    
 
  
 
 
 
  
  0
   
 
  
  0
   
 
 
 
 
 
 
 
    
 
  
 
 
 
  
  0
   
 
 
 
 
 
 
 
 
 
    
 
2141 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nYour training data does not possess much class discriminatory " .  
 
2142 
 
 
 
 
 
 
 
 
 
 
 
 
 
               "information.  It could be that the classes are inherently not well " .  
 
2143 
 
 
 
 
 
 
 
 
 
 
 
 
 
               "separable or that your constructor parameter choices are not appropriate.\n";  
 
2144 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {  
 
2145 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nYour training data possesses some class discriminatory information " .  
 
2146 
 
 
 
 
 
 
 
 
 
 
 
 
 
               "but it may not be sufficient for real-world applications.  You might " .  
 
2147 
 
 
 
 
 
 
 
 
 
 
 
 
 
               "try tweaking the constructor parameters to see if that improves the " .  
 
2148 
 
 
 
 
 
 
 
 
 
 
 
 
 
               "class discriminations.\n";  
 
2149 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {  
 
2150 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print  "\nYour training data appears to possess good class discriminatory " .  
 
2151 
 
 
 
 
 
 
 
 
 
 
 
 
 
                "information.  Whether or not it is acceptable would depend on your " .  
 
2152 
 
 
 
 
 
 
 
 
 
 
 
 
 
                "application.\n";  
 
2153 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {  
 
2154 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nYour training data is of excellent quality.\n";  
 
2155 
 
 
 
 
 
 
 
 
 
 
 
 
 
     } else {  
 
2156 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         print "\nYour training data is perfect.\n";  
 
2157 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2158 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2159 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2160 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2161 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2162 
 
 
 
 
 
 
 
 
 
 
 
 
 
 #############################################  Class DTNode  #############################################  
 
2163 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2164 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # The nodes of the decision tree are instances of this class:  
 
2165 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2166 
 
 
 
 
 
 
 
 
 
 
 
 
 
 package DTNode;  
 
2167 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2168 
 
1
 
 
 
 
 
  
1
   
 
 
 
23
 
 use strict;   
 
  
 
1
 
 
 
 
 
 
 
 
 
5
 
    
 
  
 
1
 
 
 
 
 
 
 
 
 
39
 
    
 
2169 
 
1
 
 
 
 
 
  
1
   
 
 
 
14
 
 use Carp;  
 
  
 
1
 
 
 
 
 
 
 
 
 
4
 
    
 
  
 
1
 
 
 
 
 
 
 
 
 
1578
 
    
 
2170 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2171 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # $feature is the feature test at the current node.  $branch_features_and_values is  
 
2172 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # an anonymous array holding the feature names and corresponding values on the path  
 
2173 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # from the root to the current node:  
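 ##  A hypothetical root-node construction, matching the signature below ($dt is
 ##  an Algorithm::DecisionTree instance; the entropy and the class probabilities
 ##  are made-up numbers):
 ##
 ##      my $root = DTNode->new(undef, 0.958, [0.6, 0.4], [], $dt, 'root');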
 
2174 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub new {                                                             
 
2175 
 
46
 
 
 
 
 
  
46
   
 
 
 
235
 
     my ($class, $feature, $entropy, $class_probabilities,   
 
2176 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                        $branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;   
 
2177 
 
46
 
  
100
   
 
 
 
 
 
 
 
229
 
     $root_or_not = '' if !defined $root_or_not;  
 
2178 
 
46
 
  
100
   
 
 
 
 
 
 
 
187
 
     if ($root_or_not eq 'root') {  
 
2179 
 
1
 
 
 
 
 
 
 
 
 
5
 
         $dt->{nodes_created} = -1;  
 
2180 
 
1
 
 
 
 
 
 
 
 
 
8
 
         $dt->{class_names} = undef;  
 
2181 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2182 
 
46
 
 
 
 
 
 
 
 
 
400
 
     my $self = {                                                           
 
2183 
 
 
 
 
 
 
 
 
 
 
 
 
 
             _dt                      => $dt,  
 
2184 
 
 
 
 
 
 
 
 
 
 
 
 
 
             _feature                 => $feature,                                         
 
2185 
 
 
 
 
 
 
 
 
 
 
 
 
 
             _node_creation_entropy   => $entropy,  
 
2186 
 
 
 
 
 
 
 
 
 
 
 
 
 
             _class_probabilities     => $class_probabilities,  
 
2187 
 
 
 
 
 
 
 
 
 
 
 
 
 
             _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,  
 
2188 
 
 
 
 
 
 
 
 
 
 
 
 
 
             _linked_to => [],                                            
 
2189 
 
 
 
 
 
 
 
 
 
 
 
 
 
     };  
 
2190 
 
46
 
 
 
 
 
 
 
 
 
153
 
     bless $self, $class;  
 
2191 
 
46
 
 
 
 
 
 
 
 
 
199
 
     $self->{_serial_number} =  $self->get_next_serial_num();  
 
2192 
 
46
 
 
 
 
 
 
 
 
 
136
 
     return $self;  
 
2193 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2194 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2195 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub how_many_nodes {  
 
2196 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;  
 
2197 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $self->{_dt}->{nodes_created} + 1;  
 
2198 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2199 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2200 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub set_class_names {  
 
2201 
 
1
 
 
 
 
 
  
1
   
 
 
 
5
 
     my $self = shift;  
 
2202 
 
1
 
 
 
 
 
 
 
 
 
4
 
     my $class_names_list = shift;  
 
2203 
 
1
 
 
 
 
 
 
 
 
 
4
 
     $self->{_dt}->{class_names} = $class_names_list;  
 
2204 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2205 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2206 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_class_names {  
 
2207 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;  
 
2208 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $self->{_dt}->{class_names};  
 
2209 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2210 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2211 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_next_serial_num {  
 
2212 
 
46
 
 
 
 
 
  
46
   
 
 
 
110
 
     my $self = shift;  
 
2213 
 
46
 
 
 
 
 
 
 
 
 
158
 
     $self->{_dt}->{nodes_created} += 1;  
 
2214 
 
46
 
 
 
 
 
 
 
 
 
154
 
     return $self->{_dt}->{nodes_created};  
 
2215 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2216 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2217 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_serial_num {  
 
2218 
 
46
 
 
 
 
 
  
46
   
 
 
 
115
 
     my $self = shift;  
 
2219 
 
46
 
 
 
 
 
 
 
 
 
135
 
     return $self->{_serial_number};
 
2220 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2221 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2222 
 
 
 
 
 
 
 
 
 
 
 
 
 
 # this returns the feature test at the current node  
 
2223 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_feature {                                    
 
2224 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;                                
 
2225 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $self->{ _feature };                      
 
2226 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2227 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2228 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub set_feature {  
 
2229 
 
46
 
 
 
 
 
  
46
   
 
 
 
114
 
     my $self = shift;  
 
2230 
 
46
 
 
 
 
 
 
 
 
 
105
 
     my $feature = shift;  
 
2231 
 
46
 
 
 
 
 
 
 
 
 
168
 
     $self->{_feature} = $feature;  
 
2232 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2233 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2234 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_node_entropy {  
 
2235 
 
46
 
 
 
 
 
  
46
   
 
 
 
103
 
     my $self = shift;                                
 
2236 
 
46
 
 
 
 
 
 
 
 
 
132
 
     return $self->{_node_creation_entropy};  
 
2237 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2238 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2239 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_class_probabilities {                                    
 
2240 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;                                
 
2241 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $self->{ _class_probabilities};                      
 
2242 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2243 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2244 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_branch_features_and_values_or_thresholds {  
 
2245 
 
46
 
 
 
 
 
  
46
   
 
 
 
117
 
     my $self = shift;   
 
2246 
 
46
 
 
 
 
 
 
 
 
 
178
 
     return $self->{_branch_features_and_values_or_thresholds};  
 
2247 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2248 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2249 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub add_to_branch_features_and_values {  
 
2250 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;                     
 
2251 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $feature_and_value = shift;  
 
2252 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     push @{$self->{ _branch_features_and_values }}, $feature_and_value;  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2253 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2254 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2255 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub get_children {         
 
2256 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;                     
 
2257 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     return $self->{_linked_to};  
 
2258 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2259 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2260 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub add_child_link {           
 
2261 
 
45
 
 
 
 
 
  
45
   
 
 
 
136
 
     my ($self, $new_node) = @_;
 
2262 
 
45
 
 
 
 
 
 
 
 
 
103
 
     push @{$self->{_linked_to}}, $new_node;                    
 
  
 
45
 
 
 
 
 
 
 
 
 
151
 
    
 
2263 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2264 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2265 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub delete_all_links {                    
 
2266 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;                     
 
2267 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     $self->{_linked_to} = undef;          
 
2268 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2269 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2270 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub display_node {  
 
2271 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;   
 
2272 
 
  
0
   
 
 
 
  
  0
   
 
 
 
 
 
0
 
     my $feature_at_node = $self->get_feature() || " ";  
 
2273 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $node_creation_entropy_at_node = $self->get_node_entropy();  
 
2274 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);  
 
2275 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @class_probabilities = @{$self->get_class_probabilities()};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2276 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2277 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $serial_num = $self->get_serial_num();  
 
2278 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2279 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     print "\n\nNODE $serial_num" .  
 
2280 
 
 
 
 
 
 
 
 
 
 
 
 
 
           ":\n   Branch features and values to this node: @branch_features_and_values_or_thresholds" .  
 
2281 
 
 
 
 
 
 
 
 
 
 
 
 
 
           "\n   Class probabilities at current node: @class_probabilities_for_display" .  
 
2282 
 
 
 
 
 
 
 
 
 
 
 
 
 
           "\n   Entropy at current node: $print_node_creation_entropy_at_node" .  
 
2283 
 
 
 
 
 
 
 
 
 
 
 
 
 
           "\n   Best feature test at current node: $feature_at_node\n\n";  
 
2284 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
2285 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
2286 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub display_decision_tree {  
 
2287 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
0
 
     my $self = shift;  
 
2288 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $offset = shift;  
 
2289 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
     my $serial_num = $self->get_serial_num();  
 
2290 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
0
 
     if (@{$self->get_children()} > 0) {  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
    
 
2291 
 
  
0
   
 
 
 
  
  0
   
 
 
 
 
 
0
 
         my $feature_at_node = $self->get_feature() || " ";  
 
2292 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $node_creation_entropy_at_node = $self->get_node_entropy();  
 
2293 
 
  
0
   
 
 
 
 
 
 
 
 
 
0
 
         my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);  
 
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Decision Feature: $feature_at_node    Node Creation Entropy: " ,
              "$print_node_creation_entropy_at_node   Class Probs: @print_class_probabilities_with_class\n\n";
        $offset .= "   ";
        foreach my $child (@{$self->get_children()}) {
            $child->display_decision_tree($offset);
        }
    } else {
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node   " .
              "Class Probs: @print_class_probabilities_with_class\n\n";
    }
}

##############################  Generate Your Own Numeric Training Data  #################################
#############################      Class TrainingDataGeneratorNumeric     ################################

##  See the script generate_training_data_numeric.pl in the examples
##  directory on how to use this class for generating your own numeric training and
##  test data.  The training and test data are generated in accordance with the
##  specifications you place in the parameter file that is supplied as an argument to
##  the constructor of this class.  A sketch of such a parameter file, along with a
##  usage fragment, is shown below.
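
##  The layout expected of the parameter file follows from the regexes used in
##  read_parameter_file_numeric() further below.  Here is a minimal sketch of
##  such a file together with a usage fragment.  All file names, class names,
##  feature names, and numbers are illustrative assumptions, not values shipped
##  with the module.  (Uncomment to run as a standalone script.)
#
#   use strict;
#   use warnings;
#
#   my $param_file = 'param_numeric.txt';            # hypothetical file name
#   open my $fh, '>', $param_file or die "cannot write $param_file: $!";
#   print $fh join "\n",
#       'class names: recession goodtimes',
#       'class priors: 0.4 0.6',
#       'feature name: gdp',
#       'value range: 0.0 50.0',
#       'feature name: return_on_invest',
#       'value range: 0.0 90.0',
#       'params for class: recession',
#       'mean: 12.0 30.0',
#       'covariance:',
#       '    100   70',
#       '    70    120',
#       'params for class: goodtimes',
#       'mean: 30.0 60.0',
#       'covariance:',
#       '    100   70',
#       '    70    120',
#       '';
#   close $fh;
#
#   my $generator = TrainingDataGeneratorNumeric->new(
#                       output_training_csv_file        => 'training.csv',
#                       output_test_csv_file            => 'testing.csv',
#                       parameter_file                  => $param_file,
#                       number_of_samples_for_training  => 200,
#                       number_of_samples_for_testing   => 50,
#                   );
#   $generator->read_parameter_file_numeric();
#   $generator->gen_numeric_training_and_test_data_and_write_to_csv();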
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
package TrainingDataGeneratorNumeric;

use strict;
use Carp;

sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params3(@params) == 0;
    bless {
        _output_training_csv_file          =>   $args{'output_training_csv_file'}
                                                   || croak("name for output_training_csv_file required"),
        _output_test_csv_file              =>   $args{'output_test_csv_file'}
                                                   || croak("name for output_test_csv_file required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                   || croak("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'}
                                                   || croak("number_of_samples_for_training required"),
        _number_of_samples_for_testing     =>   $args{'number_of_samples_for_testing'}
                                                   || croak("number_of_samples_for_testing required"),
        _debug                             =>   $args{debug} || 0,
        _class_names                       =>   [],
        _class_names_and_priors            =>   {},
        _features_with_value_range         =>   {},
        _features_ordered                  =>   [],
        _classes_and_their_param_values    =>   {},
    }, $class;
}

sub check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_training_csv_file
                            output_test_csv_file
                            parameter_file
                            number_of_samples_for_training
                            number_of_samples_for_testing
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  The training data generated by an instance of the class
##  TrainingDataGeneratorNumeric is based on the specs you place in a parameter file
##  that you supply to the class constructor through a constructor parameter called
##  `parameter_file'.  This method parses that parameter file in order to determine
##  the names to be used for the different data classes, their means, and their
##  covariance matrices.  (The structures it fills in are sketched just after the
##  method.)
sub read_parameter_file_numeric {
    my $self = shift;
    my @class_names = ();
    my %class_names_and_priors = ();
    my %features_with_value_range = ();
    my %classes_and_their_param_values = ();
#   my $regex8 =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    open FILE, $self->{_parameter_file} or die "unable to open parameter file: $!";
    my @params = <FILE>;
    close FILE;
    my $params = join "", @params;
    my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
    $params =~ /$regex/si;
    my ($class_names, $class_priors) = ($1, $2);
    @class_names = split ' ', $class_names;
    my @class_priors = split ' ', $class_priors;
    foreach my $i (0..@class_names-1) {
        $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
    }
    if ($self->{_debug}) {
        foreach my $cname (keys %class_names_and_priors) {
            print "$cname  =>   $class_names_and_priors{$cname}\n";
        }
    }
    $regex = 'feature name: \w*.*?value range: [\d\. -]+';
    my @features = $params =~ /$regex/gsi;
    my @features_ordered;
    $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
    foreach my $feature (@features) {
        $feature =~ /$regex/i;
        my $feature_name = $1;
        push @features_ordered, $feature_name;
        my @value_range = split ' ', $2;
        $features_with_value_range{$feature_name} = \@value_range;
    }
    if ($self->{_debug}) {
        foreach my $fname (keys %features_with_value_range) {
            print "$fname  =>   @{$features_with_value_range{$fname}}\n";
        }
    }
    foreach my $i (0..@class_names-1) {
        $classes_and_their_param_values{$class_names[$i]} = {};
    }
    $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
    my @class_params = $params =~ /$regex/gsi;
    $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
    foreach my $class_param (@class_params) {
        $class_param =~ /$regex/gsi;
        my $class_name = $1;
        my @class_mean = split ' ', $2;
        $classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
        my $class_param_string = $3;
        my @covar_rows = split '\n', $class_param_string;
        my @covar_matrix;
        foreach my $row (@covar_rows) {
            my @row = split ' ', $row;
            push @covar_matrix, \@row;
        }
        $classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
    }
    if ($self->{_debug}) {
        print "\nThe class parameters are:\n\n";
        foreach my $cname (keys %classes_and_their_param_values) {
            print "\nFor class name $cname:\n";
            my %params_hash = %{$classes_and_their_param_values{$cname}};
            foreach my $x (keys %params_hash) {
                if ($x eq 'mean') {
                    print "    $x   =>   @{$params_hash{$x}}\n";
                } elsif ($x eq 'covariance') {
                    print "    The covariance matrix:\n";
                    my @matrix = @{$params_hash{'covariance'}};
                    foreach my $row (@matrix) {
                        print "        @$row\n";
                    }
                }
            }
        }
    }
    $self->{_class_names}                    = \@class_names;
    $self->{_class_names_and_priors}         = \%class_names_and_priors;
    $self->{_features_with_value_range}      = \%features_with_value_range;
    $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
    $self->{_features_ordered}               = \@features_ordered;
}
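
##  For the sketch parameter file shown earlier (class names, feature names,
##  and numbers all illustrative), the parse above would leave structures
##  equivalent to the following in the object -- a hand-written sketch, not
##  module output:
#
#   $self->{_class_names}            = ['recession', 'goodtimes'];
#   $self->{_class_names_and_priors} = { recession => 0.4, goodtimes => 0.6 };
#   $self->{_features_ordered}       = ['gdp', 'return_on_invest'];
#   $self->{_features_with_value_range} = {
#       gdp              => ['0.0', '50.0'],
#       return_on_invest => ['0.0', '90.0'],
#   };
#   $self->{_classes_and_their_param_values} = {
#       recession => {
#           mean       => ['12.0', '30.0'],
#           covariance => [ ['100', '70'], ['70', '120'] ],
#       },
#       goodtimes => {
#           mean       => ['30.0', '60.0'],
#           covariance => [ ['100', '70'], ['70', '120'] ],
#       },
#   };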
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
##  After the parameter file is parsed by the previous method, this method calls on
##  Math::Random::random_multivariate_normal() to generate the training and test data
##  samples.  Your training and test data can be of any number of dimensions, can
##  have any mean, and any covariance.  The training and test data must obviously be
##  drawn from the same distribution.  (The call signature is sketched just after
##  the method.)
sub gen_numeric_training_and_test_data_and_write_to_csv {
    use Math::Random;
    my $self = shift;
    my %training_samples_for_class;
    my %test_samples_for_class;
    foreach my $class_name (@{$self->{_class_names}}) {
        $training_samples_for_class{$class_name} = [];
        $test_samples_for_class{$class_name} = [];
    }
    foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
        my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
        my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
        my @new_training_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        my @new_test_data = Math::Random::random_multivariate_normal(
              $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
              @mean, @covariance );
        if ($self->{_debug}) {
            print "training data for class $class_name:\n";
            foreach my $x (@new_training_data) {print "@$x\n";}
            print "\n\ntest data for class $class_name:\n";
            foreach my $x (@new_test_data) {print "@$x\n";}
        }
        $training_samples_for_class{$class_name} = \@new_training_data;
        $test_samples_for_class{$class_name} = \@new_test_data;
    }
    my @training_data_records = ();
    my @test_data_records = ();
    foreach my $class_name (keys %training_samples_for_class) {
        my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
                                          $self->{_class_names_and_priors}->{$class_name};
        my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
                                          $self->{_class_names_and_priors}->{$class_name};
        foreach my $sample_index (0..$num_of_samples_for_training-1) {
            my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
            @training_vector = map {sprintf("%.3f", $_)} @training_vector;
            my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
            push @training_data_records, $training_data_record;
        }
        foreach my $sample_index (0..$num_of_samples_for_testing-1) {
            my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
            @test_vector = map {sprintf("%.3f", $_)} @test_vector;
            my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
            push @test_data_records, $test_data_record;
        }
    }
    fisher_yates_shuffle(\@training_data_records);
    fisher_yates_shuffle(\@test_data_records);
    if ($self->{_debug}) {
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    open OUTPUT, ">$self->{_output_training_csv_file}" or die "unable to open file: $!";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
    my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
    print OUTPUT "$first_row_training\n";
    foreach my $i (0..@training_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$training_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
    open OUTPUT, ">$self->{_output_test_csv_file}" or die "unable to open file: $!";
    my @feature_names_testing = keys %{$self->{_features_with_value_range}};
    my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
    my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
    print OUTPUT "$first_row_testing\n";
    foreach my $i (0..@test_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$test_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
}
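
##  The call made above follows Math::Random's documented interface: the first
##  argument is the number of samples wanted, followed by the mean vector and
##  then the rows of the covariance matrix as array references.  A standalone
##  sketch with illustrative numbers:
#
#   use Math::Random;
#
#   my @mean       = (12.0, 30.0);
#   my @covariance = ( [100, 70], [70, 120] );
#   # A list of 5 array references, each holding one 2-dimensional sample:
#   my @samples = Math::Random::random_multivariate_normal( 5, @mean, @covariance );
#   foreach my $s (@samples) { print "@$s\n"; }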
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
# from perl docs -- an in-place random permutation of the array referenced by $arr:
sub fisher_yates_shuffle {
    my $arr = shift;
    my $i = @$arr;
    while (--$i) {
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}
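
##  A small demo of the shuffle in isolation (the items are illustrative):
#
#   my @records = ('a'..'e');
#   fisher_yates_shuffle(\@records);
#   print "@records\n";    # the same five items, in random order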
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
###########################  Generate Your Own Symbolic Training Data  ###############################
###########################     Class TrainingDataGeneratorSymbolic      #############################

##  See the sample script generate_training_and_test_data_symbolic.pl for how to use
##  this class for generating purely symbolic training and test data.  The data is
##  generated according to the specifications you place in a parameter file whose
##  name you supply as one of the constructor arguments.  A sketch of such a
##  parameter file, and of typical usage, is shown below.
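
##  A sketch of the parameter file this class expects; the layout follows the
##  regexes in read_parameter_file_symbolic() further below.  Class names,
##  features, values, and all the numbers are illustrative assumptions:
#
#   class names: benign malignant
#   class priors: 0.9 0.1
#   feature: smoking values: heavy medium light never
#   feature: exercising values: never occasionally regularly
#   bias: class: benign smoking: never=0.4 exercising: regularly=0.5
#   bias: class: malignant smoking: heavy=0.6 exercising: never=0.5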
 
package TrainingDataGeneratorSymbolic;

use strict;
use Carp;

sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params4(@params) == 0;
    bless {
        _output_training_datafile          =>   $args{'output_training_datafile'}
                                                   || die("name for output_training_datafile required"),
        _parameter_file                    =>   $args{'parameter_file'}
                                                   || die("parameter_file required"),
        _number_of_samples_for_training    =>   $args{'number_of_samples_for_training'}
                                                   || die("number_of_samples_for_training required"),
        _debug                             =>   $args{debug} || 0,
        _class_names                       =>   [],
        _class_priors                      =>   [],
        _features_and_values_hash          =>   {},
        _bias_hash                         =>   {},
        _training_sample_records           =>   {},
    }, $class;
}
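
##  Typical use of this class (file names and sample count illustrative):
#
#   my $generator = TrainingDataGeneratorSymbolic->new(
#                       output_training_datafile        => 'training_symbolic.csv',
#                       parameter_file                  => 'param_symbolic.txt',
#                       number_of_samples_for_training  => 35,
#                   );
#   $generator->read_parameter_file_symbolic();
#   $generator->gen_symbolic_training_data();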
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
sub check_for_illegal_params4 {
    my @params = @_;
    my @legal_params = qw / output_training_datafile
                            parameter_file
                            number_of_samples_for_training
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

##  Read a parameter file for generating symbolic training data.  See the script
##  generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
##  pass the name of the parameter file to the constructor of the
##  TrainingDataGeneratorSymbolic class.  (The structures this method fills in are
##  sketched just after the method.)
sub read_parameter_file_symbolic {
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
    @all_params = <INPUT>;
    close INPUT;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
    @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
    $param_string = join ' ', @all_params;
    my ($class_names, $class_priors, $rest_param) =
          $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
    my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
    push @{$self->{_class_names}}, @class_names;
    my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
    push @{$self->{_class_priors}}, @class_priors;
    my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
    my %features_and_values_hash;
    my @features = split /(feature[:])/, $feature_string;
    @features = grep {defined($_) && length($_) > 0} @features;
    foreach my $item (@features) {
        next if $item =~ /feature/;
        my @splits = split / /, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $features_and_values_hash{$splits[0]} = [];
            } else {
                next if $splits[$i] =~ /values/;
                push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
            }
        }
    }
    $self->{_features_and_values_hash} = \%features_and_values_hash;
    my %bias_hash = %{$self->{_bias_hash}};
    my @biases = split /(bias[:]\s*class[:])/, $bias_string;
    @biases = grep {defined($_) && length($_) > 0} @biases;
    foreach my $item (@biases) {
        next if $item =~ /bias/;
        my @splits = split /\s+/, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        my $feature_name;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $bias_hash{$splits[0]} = {};
            } elsif ($splits[$i] =~ /(^.+)[:]$/) {
                $feature_name = $1;
                $bias_hash{$splits[0]}->{$feature_name} = [];
            } else {
                next if !defined $feature_name;
                push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i];
            }
        }
    }
    $self->{_bias_hash} = \%bias_hash;
    if ($debug) {
        print "\n\nClass names: @class_names\n";
        my $num_of_classes = @class_names;
        print "Class priors: @class_priors\n";
        print "Number of classes: $num_of_classes\n";
        print "\nHere are the features and their possible values:\n";
        while ( my ($k, $v) = each %features_and_values_hash ) {
            print "$k ===>  @$v\n";
        }
        print "\nHere is the biasing for each class:\n";
        while ( my ($k, $v) = each %bias_hash ) {
            print "$k:\n";
            while ( my ($k1, $v1) = each %$v ) {
                print "       $k1 ===>  @$v1\n";
            }
        }
    }
}
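
##  For the sketch parameter file shown above, the parse would yield hashes
##  equivalent to the following (hand-written, illustrative):
#
#   %features_and_values_hash = (
#       smoking    => ['heavy', 'medium', 'light', 'never'],
#       exercising => ['never', 'occasionally', 'regularly'],
#   );
#   %bias_hash = (
#       benign    => { smoking => ['never=0.4'], exercising => ['regularly=0.5'] },
#       malignant => { smoking => ['heavy=0.6'], exercising => ['never=0.5'] },
#   );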
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
##  This method generates training data according to the specifications placed in the
##  parameter file that is read by the previous method.  (A sketch of the
##  unit-interval sampling it relies on follows the method.)
sub gen_symbolic_training_data {
    my $self = shift;
    my @class_names = @{$self->{_class_names}};
    my @class_priors = @{$self->{_class_priors}};
    my %training_sample_records;
    my %features_and_values_hash = %{$self->{_features_and_values_hash}};
    my %bias_hash  = %{$self->{_bias_hash}};
    my $how_many_training_samples = $self->{_number_of_samples_for_training};
    my $how_many_test_samples = $self->{_number_of_samples_for_testing};
    my %class_priors_to_unit_interval_map;
    my $accumulated_interval = 0;
    foreach my $i (0..@class_names-1) {
        $class_priors_to_unit_interval_map{$class_names[$i]}
         = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
        $accumulated_interval += $class_priors[$i];
    }
    if ($self->{_debug}) {
        print "Mapping of class priors to unit interval: \n";
        while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
            print "$k =>  @$v\n";
        }
        print "\n\n";
    }
    my $ele_index = 0;
    while ($ele_index < $how_many_training_samples) {
        my $sample_name = "sample" . "_$ele_index";
        $training_sample_records{$sample_name} = [];
        # Generate class label for this training sample:
        my $roll_the_dice = rand(1.0);
        my $class_label;
        foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
            my $v = $class_priors_to_unit_interval_map{$class_name};
            if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
                push @{$training_sample_records{$sample_name}},
                                    "class=" . $class_name;
                $class_label = $class_name;
                last;
            }
        }
        foreach my $feature (keys %features_and_values_hash) {
            my @values = @{$features_and_values_hash{$feature}};
            my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
            my $no_bias = 1.0 / @values;
            $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
            my %value_priors_to_unit_interval_map;
            my @splits = split /\s*=\s*/, $bias_string;
            my $chosen_for_bias_value = $splits[0];
            my $chosen_bias = $splits[1];
            my $remaining_bias = 1 - $chosen_bias;
            my $remaining_portion_bias = $remaining_bias / (@values - 1);
            @splits = grep {defined($_) && length($_) > 0} @splits;
            my $accumulated = 0;
            foreach my $i (0..@values-1) {
                if ($values[$i] eq $chosen_for_bias_value) {
                    $value_priors_to_unit_interval_map{$values[$i]}
                        = [$accumulated, $accumulated + $chosen_bias];
                    $accumulated += $chosen_bias;
                } else {
                    $value_priors_to_unit_interval_map{$values[$i]}
                        = [$accumulated, $accumulated + $remaining_portion_bias];
                    $accumulated += $remaining_portion_bias;
                }
            }
            my $roll_the_dice = rand(1.0);
            my $value_label;
            foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
                my $v = $value_priors_to_unit_interval_map{$value_name};
                if ( ($roll_the_dice >= $v->[0])
                             && ($roll_the_dice <= $v->[1]) ) {
                    push @{$training_sample_records{$sample_name}},
                                            $feature . "=" . $value_name;
                    $value_label = $value_name;
                    last;
                }
            }
            if ($self->{_debug}) {
                print "mapping feature value priors for '$feature' " .
                                          "to unit interval: \n";
                while ( my ($k, $v) =
                        each %value_priors_to_unit_interval_map ) {
                    print "$k =>  @$v\n";
                }
                print "\n\n";
            }
        }
        $ele_index++;
    }
    $self->{_training_sample_records} = \%training_sample_records;
    if ($self->{_debug}) {
        print "\n\nPRINTING TRAINING RECORDS:\n\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
            print "$kee =>  @{$training_sample_records{$kee}}\n\n";
        }
    }
    my $output_training_file = $self->{_output_training_datafile};
    print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
    open FILEHANDLE, ">$output_training_file" or die "unable to open $output_training_file: $!";
    my @features = sort keys %features_and_values_hash;
    my $title_string = ',class';
    foreach my $feature_name (@features) {
        $title_string .= ',' . $feature_name;
    }
    print FILEHANDLE "$title_string\n";
    my @sample_names = sort {$a <=> $b}  map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
    my $record_string = '';
    foreach my $sample_name (@sample_names) {
        $record_string .= "$sample_name,";
        my @record = @{$training_sample_records{"sample_$sample_name"}};
        my %item_parts_hash;
        foreach my $item (@record) {
            my @splits = grep $_, split /=/, $item;
            $item_parts_hash{$splits[0]} = $splits[1];
        }
        $record_string .= $item_parts_hash{"class"};
        delete $item_parts_hash{"class"};
        my @kees = sort keys %item_parts_hash;
        foreach my $kee (@kees) {
            $record_string .= ",$item_parts_hash{$kee}";
        }
        print FILEHANDLE "$record_string\n";
        $record_string = '';
    }
    close FILEHANDLE;
}
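
##  How a class label is drawn above, in isolation: the priors partition the
##  unit interval, and a uniform draw selects the sub-interval it lands in.
##  A minimal standalone sketch of that mapping (priors are illustrative):
#
#   my @class_names  = ('benign', 'malignant');
#   my @class_priors = (0.9, 0.1);
#   my %map; my $acc = 0;
#   foreach my $i (0..@class_names-1) {
#       $map{$class_names[$i]} = [$acc, $acc + $class_priors[$i]];
#       $acc += $class_priors[$i];
#   }
#   # %map is ( benign => [0, 0.9], malignant => [0.9, 1.0] )
#   my $dice = rand(1.0);    # e.g. 0.93 would select 'malignant'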
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
##  Extracts the numeric suffix from a sample name such as `sample_23'.
sub sample_index {
    my $arg = shift;
    $arg =~ /_(.+)$/;
    return $1;
}

#################################   Decision Tree Introspection   #######################################
#################################      Class DTIntrospection      #######################################

package DTIntrospection;

##  Instances constructed from this class can provide explanations for the
##  classification decisions at the nodes of a decision tree.
##
##  When used in the interactive mode, the decision-tree introspection made possible
##  by this class provides answers to the following three questions: (1) What is the
##  list of the training samples that fall in the portion of the feature space that
##  corresponds to a node of the decision tree? (2) What are the probabilities
##  associated with the last feature test that led to the node? And (3) what are the
##  class probabilities predicated on just the last feature test on the path to that
##  node?
##
##  CAVEAT: It is possible for a node to exist even when there are no training
##  samples in the portion of the feature space that corresponds to the node.  That
##  is because a decision tree is based on the probability densities estimated from
##  the training data.  When training data is non-uniformly distributed, it is
##  possible for the probability associated with a point in the feature space to be
##  non-zero even when there are no training samples at or in the vicinity of that
##  point.
##
##  For a node to exist even when there are no training samples in the portion of
##  the feature space that belongs to the node is an indication of the generalization
##  ability of decision-tree based classification.
##
##  When used in a non-interactive mode, an instance of this class can be used to
##  create a tabular display that shows what training samples belong directly to the
##  portion of the feature space that corresponds to each node of the decision tree.
##  An instance of this class can also construct a tabular display that shows how the
##  influence of each training sample propagates in the decision tree.  For each
##  training sample, this display first shows the list of nodes that came into
##  existence through feature test(s) that used the data provided by that sample.
##  This list for each training sample is followed by a subtree of the nodes that owe
##  their existence indirectly to the training sample.  A training sample influences
##  a node indirectly if the node is a descendant of another node that is affected
##  directly by the training sample.  (A minimal usage sketch follows.)
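
##  A minimal usage sketch (hand-written; it assumes a decision tree already
##  built with Algorithm::DecisionTree -- the method names in the first five
##  lines follow that module's synopsis and the training-file name is
##  illustrative):
#
#   my $dt = Algorithm::DecisionTree->new( training_datafile => 'training.csv' );
#   $dt->get_training_data();
#   $dt->calculate_first_order_probabilities();
#   $dt->calculate_class_priors();
#   $dt->construct_decision_tree_classifier();
#   my $introspector = DTIntrospection->new($dt);
#   $introspector->initialize();
#   $introspector->display_training_samples_at_all_nodes_direct_influence_only();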
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
use strict;
use Carp;

sub new {
    my ($class, $dt) = @_;
    croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
        unless ref($dt) eq "Algorithm::DecisionTree";
    bless {
        _dt                                 => $dt,
        _root_dtnode                        => $dt->{_root_node},
        _samples_at_nodes_hash              => {},
        _branch_features_to_nodes_hash      => {},
        _sample_to_node_mapping_direct_hash => {},
        _node_serial_num_to_node_hash       => {},
        _awareness_raising_msg_shown        => 0,
        _debug                              => 0,
    }, $class;
}

sub initialize {
    my $self = shift;
    croak "You must first construct the decision tree before using the DTIntrospection class"
        unless $self->{_root_dtnode};
    $self->recursive_descent($self->{_root_dtnode});
}

2913 
 
 
 
 
 
 
 
 
 
 
 
 
 
 sub recursive_descent {  
 
2914 
 
  
0
   
 
 
 
 
 
  
0
   
 
 
 
 
 
     my $self = shift;  
 
2915 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     my $node = shift;  
 
2916 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     my $node_serial_number = $node->get_serial_num();  
 
2917 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();  
 
2918 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
     print "\nAt node $node_serial_number:  the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
    
 
2919 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;  
 
2920 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;  
 
2921 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     my @samples_at_node = ();  
 
2922 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     foreach my $item (@$branch_features_and_values_or_thresholds) {  
 
2923 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
         my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);  
 
2924 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
         unless (@samples_at_node) {  
 
2925 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
             @samples_at_node =  @$samples_for_feature_value_combo;  
 
2926 
 
 
 
 
 
 
 
 
 
 
 
 
 
         } else {  
 
2927 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
             my @accum;  
 
2928 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
             foreach my $sample (@samples_at_node) {  
 
2929 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
                 push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);    
 
2930 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2931 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
             @samples_at_node =  @accum;  
 
2932 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2933 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
         last unless @samples_at_node;  
 
2934 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2935 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;   
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
    
 
2936 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
     print "Node: $node_serial_number    the samples are: [@samples_at_node]\n"  if ($self->{_debug});  
 
2937 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;  
 
2938 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
     if (@samples_at_node) {  
 
2939 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
         foreach my $sample (@samples_at_node) {  
 
2940 
 
  
0
   
 
  
  0
   
 
 
 
 
 
 
 
 
 
             if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {  
 
2941 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
                 $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];   
 
2942 
 
 
 
 
 
 
 
 
 
 
 
 
 
             } else {  
 
2943 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
                 push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;  
 
  
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
    
 
2944 
 
 
 
 
 
 
 
 
 
 
 
 
 
             }  
 
2945 
 
 
 
 
 
 
 
 
 
 
 
 
 
         }  
 
2946 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2947 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     my $children = $node->get_children();  
 
2948 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
     foreach my $child (@$children) {  
 
2949 
 
  
0
   
 
 
 
 
 
 
 
 
 
 
 
         $self->recursive_descent($child);  
 
2950 
 
 
 
 
 
 
 
 
 
 
 
 
 
     }  
 
2951 
 
 
 
 
 
 
 
 
 
 
 
 
 
 }  
 
sub display_training_samples_at_all_nodes_direct_influence_only {
    my $self = shift;
    croak "You must first construct the decision tree before using the DT Introspection class."
        unless $self->{_root_dtnode};
    $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
}
 
sub recursive_descent_for_showing_samples_at_a_node {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
        print "\nAt node $node_serial_number:  the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n"  if $self->{_debug};
        print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
    }
    map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
}
 
sub display_training_samples_to_nodes_influence_propagation {
    my $self = shift;
    foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
        if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
            my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
            print "\n$sample:\n    nodes affected directly: [@{$nodes_directly_affected}]\n";
            print "    nodes affected through probabilistic generalization:\n";
            map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, "    "), @$nodes_directly_affected;
        }
    }
}
 
sub recursive_descent_for_sample_to_node_influence {
    my $self = shift;
    my $node_serial_num = shift;
    my $nodes_already_accounted_for = shift;
    my $offset = shift;
    $offset .= "    ";
    my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
    my @children = map $_->get_serial_num(), @{$node->get_children()};
    my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
    if (@children_affected) {
        print "$offset $node_serial_num => [@children_affected]\n";
    }
    map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
}
 
sub get_samples_for_feature_value_combo {
    my $self = shift;
    my $feature_value_combo = shift;
    my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
    my @samples = ();
    if ($op eq '=') {
        @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
    } elsif ($op eq '<') {
        foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
            foreach my $item (@features_and_values) {
                my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
                if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
                    push @samples, $sample;
                    last;
                }
            }
        }
    } elsif ($op eq '>') {
        foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
            foreach my $item (@features_and_values) {
                my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
                if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
                    push @samples, $sample;
                    last;
                }
            }
        }
    } else {
        die "Something strange is going on";
    }
    return \@samples;
}
 
sub extract_feature_op_val {
    my $self = shift;
    my $feature_value_combo = shift;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my ($feature,$value,$op);
    if ($feature_value_combo =~ /$pattern2/) {
        ($feature,$op,$value) = ($1,'<',$2);
    } elsif ($feature_value_combo =~ /$pattern3/) {
        ($feature,$op,$value) = ($1,'>',$2);
    } elsif ($feature_value_combo =~ /$pattern1/) {
        ($feature,$op,$value) = ($1,'=',$2);
    }
    return ($feature,$op,$value);
}
 
sub explain_classifications_at_multiple_nodes_interactively {
    my $self = shift;
    croak "You called explain_classification_at_multiple_nodes_interactively() without " .
        "first initializing the DTIntrospection instance in your code. Aborting."
               unless $self->{_samples_at_nodes_hash};
    print "\n\nIn order for the decision tree to introspect\n\n";
    print "  DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
          "  that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
          "  with it even when there are NO training data points in that region?\n" .
          "\nEnter 'y' for yes or any other character for no:  ";
    my $ans = <STDIN>;
    $ans =~ s/^\s*|\s*$//g;
    die "\n  Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
    $self->{_awareness_raising_msg_shown} = 1;
    while (1) {
        my $node_id;
        my $ans;
        while (1) {
            print "\nEnter the integer ID of a node: ";
            $ans = <STDIN>;
            $ans =~ s/^\s*|\s*$//g;
            return if $ans =~ /^exit$/;
            last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
            print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
        }
        $node_id = $ans;
        $self->explain_classification_at_one_node($node_id);
    }
}
 
sub explain_classification_at_one_node {
    my $self = shift;
    my $node_id = shift;
    croak "You called explain_classification_at_one_node() without first initializing " .
        "the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
    unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
        print "Node $node_id is not a node in the tree\n";
        return;
    }
    unless ($self->{_awareness_raising_msg_shown}) {
        print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
        print "  DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
              "  that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
              "  with it even when there are NO training data points in that region?\n" .
              "\nEnter 'y' for yes or any other character for no:  ";
        my $ans = <STDIN>;
        $ans =~ s/^\s*|\s*$//g;
        die "\n  Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
    }
    my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
    my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
#    my @class_names = @{$self->get_class_names()};
    my @class_names = $self->{_dt}->get_class_names();
    my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
    my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
    my $msg = @samples_at_node == 0
              ? "\n\n    There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
              : "\n    Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
    $msg .= "\n    Features tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
    $msg .= "\n    Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
    $msg .= "\n    Enter 'y' if yes and any other character for 'no': ";
    print $msg;
    my $ans = <STDIN>;
    $ans =~ s/^\s*|\s*$//g;
    if ($ans =~ /^ye?s?$/) {
        my $sequence = [$branch_features_to_node[-1]];
        my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
        print "\n    probability of @{$sequence} is: $prob\n";
    }
    $msg = "\n    Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
    $msg .= "\n    Enter 'y' for yes and any other character for no:  ";
    print $msg;
    $ans = <STDIN>;
    $ans =~ s/^\s*|\s*$//g;
    if ($ans =~ /^ye?s?$/) {
        my $sequence = [$branch_features_to_node[-1]];
        foreach my $cls (@class_names) {
            my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
            print "\n    probability of class $cls given just the feature test @{$sequence} is: $prob\n";
        }
    } else {
        print "goodbye\n";
    }
    print "\n    Finished supplying information on Node $node_id\n\n";
}
 
1;
 
=pod

=encoding UTF8

=head1 NAME

Algorithm::DecisionTree - A Perl module for decision-tree based classification of
multidimensional data.
 
=head1 SYNOPSIS

  # FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:

  # In general, your call for constructing an instance of the DecisionTree class
  # will look like:

      my $training_datafile = "stage3cancer.csv";
      my $dt = Algorithm::DecisionTree->new(
                                training_datafile => $training_datafile,
                                csv_class_column_index => 2,
                                csv_columns_for_features => [3,4,5,6,7,8],
                                entropy_threshold => 0.01,
                                max_depth_desired => 8,
                                symbolic_to_numeric_cardinality_threshold => 10,
                                csv_cleanup_needed => 1,
                 );

  # The constructor option `csv_class_column_index' informs the module as to which
  # column of your CSV file contains the class label.  THE COLUMN INDEXING IS ZERO
  # BASED.  The constructor option `csv_columns_for_features' specifies which columns
  # are to be used for feature values.  The first row of the CSV file must specify
  # the names of the features.  See examples of CSV files in the `Examples'
  # subdirectory.

  # The option `symbolic_to_numeric_cardinality_threshold' is also important.  For
  # the example shown above, if an ostensibly numeric feature takes on only 10 or
  # fewer different values in your training datafile, it will be treated like a
  # symbolic feature.  The option `entropy_threshold' determines the granularity
  # with which the entropies are sampled for the purpose of calculating entropy gain
  # with a particular choice of decision threshold for a numeric feature or a feature
  # value for a symbolic feature.
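
  # As a concrete illustration of the cardinality threshold (a hypothetical case,
  # not drawn from any file shipped with the module): with the threshold of 10
  # shown above, an ostensibly numeric feature that takes only the integer values
  # 1 through 8 in your training data would be treated symbolically, whereas a
  # feature such as age, with dozens of distinct values, would be treated
  # numerically.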
 
  # The option 'csv_cleanup_needed' is by default set to 0.  If you set it
  # to 1, that would cause all line records in your CSV file to be "sanitized" before
  # they are used for constructing a decision tree.  You need this option if your CSV
  # file uses double-quoted field names and field values in the line records and if
  # such double-quoted strings are allowed to include commas for, presumably, better
  # readability.
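
  # As a hypothetical illustration (the field values below are made up), setting
  # 'csv_cleanup_needed' to 1 lets the module cope with a line record such as:
  #
  #     "Smith, John","stage 3",4.2,55.0
  #
  # in which the comma inside the double-quoted first field would otherwise be
  # mistaken for a field separator.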
 
  # After you have constructed an instance of the DecisionTree class as shown above,
  # you read in the training data file and initialize the probability cache by
  # calling:

      $dt->get_training_data();
      $dt->calculate_first_order_probabilities();
      $dt->calculate_class_priors();

  # Next you construct a decision tree for your training data by calling:

      $root_node = $dt->construct_decision_tree_classifier();

  # where $root_node is an instance of the DTNode class that is also defined in the
  # module file.  Now you are ready to classify a new data record.  Let's say that
  # your data record looks like:

      my @test_sample  = qw /  g2=4.2
                               grade=2.3
                               gleason=4
                               eet=1.7
                               age=55.0
                               ploidy=diploid /;

  # You can classify it by calling:

      my $classification = $dt->classify($root_node, \@test_sample);

  # The call to `classify()' returns a reference to a hash whose keys are the class
  # names and the values the associated classification probabilities.  This hash also
  # includes another key-value pair for the solution path from the root node to the
  # leaf node at which the final classification was carried out.
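
  # For example, you could print out the class probabilities as shown below.
  # (This is only a sketch: it assumes, per the description above, that apart
  # from the class names the returned hash contains one additional key --- taken
  # here to be 'solution_path' --- for the solution path.)

      foreach my $class_name (sort keys %$classification) {
          next if $class_name eq 'solution_path';
          print "$class_name  =>  $classification->{$class_name}\n";
      }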
 
=head1 CHANGES

B<Version 3.43:>  This version fixes a bug in the C<...> function.
The source of the bug was a typo in a regex component meant for matching with white
space.  I have also made one additional change to this function to increase its
versatility.  With this change, you are now allowed to have empty strings as values
for features.
 
B<Version 3.42:>  This version reintroduces C<csv_cleanup_needed> as an optional
parameter in the module constructor.  This was done in response to several requests
received from the user community. (Previously, all line records from a CSV file were
processed by the C<...> function no matter what.)  The main point made by
the users was that invoking C<...> when there was no need for CSV clean-up
extracted a performance penalty when ingesting large database files with tens of
thousands of line records.  In addition to making C<csv_cleanup_needed> optional, I
have also tweaked up the code in the C<...> function in order to extract
data from a larger range of messy CSV files.
 
B<Version 3.41:>  All the changes made in this version relate to the construction of
regression trees.  I have fixed a couple of bugs in the calculation of the regression
coefficients. Additionally, the C<RegressionTree> class now comes with a new
constructor parameter named C<jacobian_choice>.  For most cases, you'd set this
parameter to 0, which causes the regression coefficients to be estimated through
linear least-squares minimization.
 
B<Version 3.40:>  In addition to constructing decision trees, this version of the
module also allows you to construct regression trees. The regression tree capability
has been packed into a separate subclass, named C<RegressionTree>, of the main
C<Algorithm::DecisionTree> class.  The subdirectory C<ExamplesRegression> in the main
installation directory illustrates how you can use this new functionality of the
module.
 
B<Version 3.0:>  This version incorporates four very significant upgrades/changes to
the C<DecisionTree> module: B<(1)> The CSV cleanup is now the default. So you do not
have to set any special parameters in the constructor calls to initiate CSV
cleanup. B<(2)> In the form of a new Perl class named C<RandomizedTreesForBigData>,
this module provides you with an easy-to-use programming interface for attempting
needle-in-a-haystack solutions for the case when your training data is overwhelmingly
dominated by a single class.  You need to set the constructor parameter
C<looking_for_needles_in_haystack> to invoke the logic that constructs multiple
decision trees, each using the minority class samples along with samples drawn
randomly from the majority class.  The final classification is made through a
majority vote from all the decision trees.  B<(3)> Assuming you are faced with a
big-data problem --- in the sense that you have been given a training database with a
very large number of training records --- the class C<RandomizedTreesForBigData> will
also let you construct multiple decision trees by pulling training data randomly from
your training database (without paying attention to the relative populations of the
classes).  The final classification decision for a test sample is based on a majority
vote from all the decision trees thus constructed.  See the C<ExamplesRandomizedTrees>
directory for how to use these new features of the module. And, finally, B<(4)>
Support for the old-style '.dat' training files has been dropped in this version.
 
B<Version 2.31:>  This version makes it easier to use a CSV training file that
violates the assumption that a comma be used only to separate the different field
values in a line record.  Some large econometrics databases use double-quoted values
for fields, and these values may contain commas (presumably for better readability).
This version also allows you to specify the leftmost entry in the first CSV record
that names all the fields. Previously, this entry was required to be an empty
double-quoted string.  I have also made some minor changes to the
C<get_training_data()> method to make it more user friendly for large
training files that may contain tens of thousands of records.  When pulling training
data from such files, this method prints out a dot on the terminal screen for every
10000 records it has processed.
 
B<Version 2.30:>  This version brings the boosting capability to the
C<Algorithm::DecisionTree> module.
 
B<Version 2.26:>  This version adds bagging to the C<Algorithm::DecisionTree> module.
If your training dataset is large enough, you can ask the module to construct multiple
decision trees using data bags extracted from your dataset.  The module can show you
the results returned by the individual decision trees and also the results obtained
by taking a majority vote of the classification decisions made by the individual
trees.  You can specify any arbitrary extent of overlap between the data bags.
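
The majority-vote step at the end is simple to picture.  The following stand-alone
fragment is only a sketch of the idea and does not use the module's actual bagging
interface:

    my @tree_decisions = qw(buy buy sell);    # one decision per bagged tree
    my %votes;
    $votes{$_}++ for @tree_decisions;
    my ($majority) = sort { $votes{$b} <=> $votes{$a} } keys %votes;
    print "majority decision: $majority\n";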
 
B<Version 2.25:>  The introspection capability in this version packs more of a punch.
For each training data sample, you can now figure out not only the decision-tree
nodes that are affected directly by that sample, but also those nodes that are
affected indirectly through the generalization achieved by the probabilistic modeling
of the data.  The 'examples' directory of this version includes additional scripts
that illustrate these enhancements to the introspection capability.  See the section
"The Introspection API" for a declaration of the introspection related methods, old
and new.
 
B<Version 2.24:>  In response to requests from several users, this version includes a new
capability: You can now ask the module to introspect about the classification
decisions returned by the decision tree.  Toward that end, the module includes a new
class named C<DTIntrospection>.  Perhaps the most important bit of information you
are likely to seek through DT introspection is the list of the training samples that
fall directly in the portion of the feature space that is assigned to a node.

B<IMPORTANT:> When training samples are non-uniformly distributed in the underlying
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE.  B<(This is an important
part of the generalization achieved by probabilistic modeling of the training data.)>
For additional information related to DT introspection, see the section titled
"DECISION TREE INTROSPECTION" in this documentation page.
 
B<Version 2.23> fixes a bug in the part of the module that some folks use for generating
synthetic data for experimenting with decision tree construction and classification.
In the class C<...> that is a part of the module, there
was a problem with the order in which the features were recorded from the
user-supplied parameter file.  The basic code for decision tree construction and
classification remains unchanged.
 
B<Version 2.22> further downshifts the required version of Perl for this module.  This
was a result of testing the module with Version 5.10.1 of Perl.  Only one statement
in the module code needed to be changed for the module to work with the older version
of Perl.
 
B<Version 2.21> fixes the C<...> restriction on the required Perl version.  This
version should work with Perl versions 5.14.0 and higher.
 
B<Version 2.2> changes the required version of Perl from 5.18.0 to 5.14.0.  Everything
else remains the same.
 
B<Version ...> should prove more robust when the probability distribution for the
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
observations can occur with significant probabilities.  A new option in the
DecisionTree constructor lets the user specify the precision with which the
probability distributions are estimated for such features.
 
B<Version ...> fixes a bug that was caused by the explicitly set zero values for
numerical features being misconstrued as "false" in the conditional statements in
some of the method definitions.
 
B<Version ...> makes it easier to write code for classifying in one go all of your test
data samples in a CSV file.  The bulk classifications obtained can be written out to
either a CSV file or to a regular text file.  See the script C<...> in the
C<Examples> directory for how to classify all of your test data records in a CSV
file.  This version also includes improved code for generating synthetic
numeric/symbolic training and test data records for experimenting with the decision
tree classifier.
 
B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
cross-validation test on the data.  This test divides all of the training data into
ten parts, with nine parts used for training a decision tree and one part used for
testing its ability to classify correctly. This selection of nine parts for training
and one part for testing is carried out in all of the ten different ways that are
possible.  This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C<entropy_threshold>,
C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.
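
The way the ten train/test splits are formed is easy to picture with a small
stand-alone sketch.  This is only an illustration of 10-fold partitioning and does
not show the module's own interface for cross-validation testing:

    my @sample_indices = (0 .. 99);           # indices into your training data
    my $K = 10;
    foreach my $fold (0 .. $K - 1) {
        my @testing  = grep { $_ % $K == $fold } @sample_indices;
        my @training = grep { $_ % $K != $fold } @sample_indices;
        # construct a decision tree from @training and then measure how often
        # it classifies the samples in @testing correctly
    }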
 
B<Version 2.0:>  Now you can use both numeric and
symbolic features for constructing a decision tree. A feature is numeric if it can
take any floating-point value over an interval.
 
B<Version 1.71> fixes a bug in the code that was triggered by 0 being declared as one of
the feature values in the training datafile. Version 1.71 also includes an additional
safety feature that is useful for training datafiles that contain a very large number
of features.  The new version makes sure that the number of values you declare for
each sample record matches the number of features declared at the beginning of the
training datafile.
 
B<Version 1.7> includes safety checks on the consistency of the data you place in your
training datafile. When a training file contains thousands of samples, it is
difficult to manually check that you used the same class names in your sample records
that you declared at top of your training file or that the values you have for your
features are legal vis-a-vis the earlier declarations of the values in the training
file.  Another safety feature incorporated in this version is the non-consideration
of classes that are declared at the top of the training file but that have no sample
records in the file.
 
B<Version 1.6> uses probability caching much more extensively compared to the previous
versions.  This should result in faster construction of large decision trees.
Another new feature in Version 1.6 is the use of a decision tree for interactive
classification. In this mode, after you have constructed a decision tree from the
training data, the user is prompted for answers to the questions pertaining to the
feature tests at the nodes of the tree.
 
Some key elements of the documentation were cleaned up and made more readable in
B<Version 1.5>.  The implementation code remains unchanged from Version 1.4.
 
B<Version 1.4> should make things faster (and easier) for folks who want to use this
module with training data that creates very large decision trees (that is, trees with
tens of thousands or more decision nodes).  The speedup in Version 1.4 has been
achieved by eliminating duplicate calculation of probabilities as the tree grows.  In
addition, this version provides an additional constructor parameter,
C<max_depth_desired> for controlling the size of the decision tree.  This is in
addition to the tree size control achieved by the parameter C<entropy_threshold> that
was introduced in Version 1.3.  Since large decision trees can take a long time to
create, you may find yourself wishing you could store the tree you just created in a
disk file and that, subsequently, you could use the stored tree for classification
work.  The C<Examples> directory contains two scripts, C<...> and
C<...>, that show how you can do exactly that with the
help of Perl's C<Storable> module.
 
B<Version 1.3> addresses the issue that arises when the header of a training datafile
declares a certain possible value for a feature but that (feature,value) pair does
NOT show up anywhere in the training data.  Version 1.3 also makes it possible for a
user to control the size of the decision tree by changing the value of the parameter
C<entropy_threshold>.  Additionally, Version 1.3 includes a method called
C<determine_data_condition()> that displays useful information regarding the size and
some other attributes of the training data.  It also warns the user if the training
data might result in a decision tree that would simply be much too large --- unless
the user controls the size with the entropy_threshold parameter.
 
In addition to the removal of a couple of serious bugs, B<Version 1.2> incorporates a
number of enhancements: (1) Version 1.2 includes checks on the names of the features
and values used in test data --- this is the data you want to classify with the
decision tree classifier constructed by this module.  (2) Version 1.2 includes a
separate constructor for generating test data.  To make it easier to generate test
data whose probabilistic parameters may not be identical to that used for the
training data, I have used separate routines for generating the test data.  (3)
Version 1.2 also includes in its examples directory a script that classifies the test
data in a file and outputs the class labels into another file.  This is for folks who
do not wish to write their own scripts using this module. (4) Version 1.2 also
includes addition to the documentation regarding the issue of numeric values for
features.
 
=head1 DESCRIPTION

B<Algorithm::DecisionTree> is a I<perl5> module for constructing a decision tree from
a training datafile containing multidimensional data.  In one form or another,
decision trees have been around for about fifty years.  From a statistical
perspective, they are closely related to classification and regression by recursive
partitioning of multidimensional data.  Early work that demonstrated the usefulness
of such partitioning of data for classification and regression can be traced to the
work of Terry Therneau in the early 1980's in the statistics community, and to the
work of Ross Quinlan in the mid 1990's in the machine learning community.
 
For those not familiar with decision tree ideas, the traditional way to classify
multidimensional data is to start with a feature space whose dimensionality is the
same as that of the data.  Each feature in this space corresponds to the attribute
that each dimension of the data measures.  You then use the training data to carve up
the feature space into different regions, each corresponding to a different class.
Subsequently, when you try to classify a new data sample, you locate it in the
feature space and find the class label of the region to which it belongs.  One can
also give the new data point the same class label as that of the nearest training
sample. This is referred to as the nearest neighbor classification.  There exist
hundreds of variations of varying power on these two basic approaches to the
classification of multidimensional data.
 
A decision tree classifier works differently.  When you construct a decision tree,
you select for the root node a feature test that partitions the training data in a
way that causes maximal disambiguation of the class labels associated with the data.
In terms of information content as measured by entropy, such a feature test would
cause maximum reduction in class entropy in going from all of the training data taken
together to the data as partitioned by the feature test.  You then drop from the root
node a set of child nodes, one for each partition of the training data created by the
feature test at the root node. When your features are purely symbolic, you'll have
one child node for each value of the feature chosen for the feature test at the root.
When the test at the root involves a numeric feature, you find the decision threshold
for the feature that best bipartitions the data and you drop from the root node two
child nodes, one for each partition.  Now at each child node you pose the same
question that you posed when you found the best feature to use at the root: Which
feature at the child node in question would maximally disambiguate the class labels
associated with the training data corresponding to that child node?
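
To make the entropy criterion concrete, here is a minimal stand-alone sketch of how
the class entropy of a collection of training labels could be computed.  It is meant
only to illustrate the idea and is not how this module organizes its own entropy
calculations internally:

    use List::Util qw(sum);

    # Shannon entropy (in bits) of the class labels in a data partition:
    sub class_entropy {
        my @labels = @_;
        return 0 unless @labels;
        my %counts;
        $counts{$_}++ for @labels;
        my $total = scalar @labels;
        return -sum( map { my $p = $counts{$_} / $total;
                           $p * log($p) / log(2) } keys %counts );
    }

    # A 50/50 mix of two classes has the maximum entropy of 1 bit; the best
    # feature test is the one that most reduces the average entropy of the
    # partitions it creates:
    printf "%.3f bits\n", class_entropy( qw(buy buy sell buy sell sell) );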
 
As the reader would expect, the two key steps in any approach to decision-tree based
classification are the construction of the decision tree itself from a file
containing the training data, and then using the decision tree thus obtained for
classifying new data.
 
What is cool about decision tree classification is that it gives you soft
classification, meaning it may associate more than one class label with a given data
vector.  When this happens, it may mean that your classes are indeed overlapping in
the underlying feature space.  It could also mean that you simply have not supplied
sufficient training data to the decision tree classifier.  For a tutorial
introduction to how a decision tree is constructed and used, visit
L<https://engineering.purdue.edu/kak/Tutorials/DecisionTreeClassifiers.pdf>
 
This module also allows you to generate your own synthetic training and test
data. Generating your own training data, using it for constructing a decision-tree
classifier, and subsequently testing the classifier on a synthetically generated
test set of data is a good way to develop greater proficiency with decision trees.
 
=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE

If you are new to the concept of a decision tree, its practical utility is best
understood with an example that only involves symbolic features. However, as
mentioned earlier, versions of the module higher than 2.0 allow you to use both
symbolic and numeric features.
 
Consider the following scenario: Let's say you are running a small investment company
that employs a team of stockbrokers who make buy/sell decisions for the customers of
your company.  Assume that your company has asked the traders to make each investment
decision on the basis of the following four criteria:

  price_to_earnings_ratio   (P_to_E)

  price_to_sales_ratio      (P_to_S)

  return_on_equity          (R_on_E)

  market_share              (MS)
 
Since you are the boss, you keep track of the buy/sell decisions made by the
individual traders.  But one unfortunate day, all of your traders decide to quit
because you did not pay them enough.  So what do you do?  If you had a module like
the one here, you could still run your company and do so in such a way that, on the
average, would do better than any of the individual traders who worked for your
company.  This is what you do: You pool together the individual trader buy/sell
decisions you have accumulated during the last one year.  This pooled information is
likely to look like:


  example      buy/sell     P_to_E     P_to_S     R_on_E      MS
  ==============================================================

  example_1     buy          high       low        medium    low
  example_2     buy          medium     medium     low       low
  example_3     sell         low        medium     low       high
  ....
  ....
 
This data, when formatted according to CSV, would constitute your training file. You
could feed this file into the module by calling:

    my $dt = Algorithm::DecisionTree->new(
                     training_datafile => $training_datafile,
                     csv_class_column_index => 1,
                     csv_columns_for_features => [2,3,4,5],
             );
    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
 
3538 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3539 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Subsequently, you would construct a decision tree by calling:  
 
3540 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3541 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $root_node = $dt->construct_decision_tree_classifier();  
 
3542 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3543 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Now you and your company (with practically no employees) are ready to service the  
 
3544 
 
 
 
 
 
 
 
 
 
 
 
 
 
 customers again. Suppose your computer needs to make a buy/sell decision about an  
 
3545 
 
 
 
 
 
 
 
 
 
 
 
 
 
 investment prospect that is best described by:  
 
3546 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3547 
 
 
 
 
 
 
 
 
 
 
 
 
 
     price_to_earnings_ratio  =  low  
 
3548 
 
 
 
 
 
 
 
 
 
 
 
 
 
     price_to_sales_ratio     =  very_low  
 
3549 
 
 
 
 
 
 
 
 
 
 
 
 
 
     return_on_equity         =  none  
 
3550 
 
 
 
 
 
 
 
 
 
 
 
 
 
     market_share             =  medium      
 
3551 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3552 
 
 
 
 
 
 
 
 
 
 
 
 
 
 All that your computer would need to do would be to construct a data vector like  
 
3553 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3554 
 
 
 
 
 
 
 
 
 
 
 
 
 
    my @data =   qw / P_to_E=low  
 
3555 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      P_to_S=very_low  
 
3556 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      R_on_E=none  
 
3557 
 
 
 
 
 
 
 
 
 
 
 
 
 
                      MS=medium /;  
 
3558 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3559 
 
 
 
 
 
 
 
 
 
 
 
 
 
 and call the decision tree classifier you just constructed by  
 
3560 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3561 
 
 
 
 
 
 
 
 
 
 
 
 
 
     $dt->classify($root_node, \@data);   
 
3562 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3563 
 
 
 
 
 
 
 
 
 
 
 
 
 
 The answer returned will be 'buy' and 'sell', along with the associated  
 
3564 
 
 
 
 
 
 
 
 
 
 
 
 
 
 probabilities.  So if the probability of 'buy' is considerably greater than the  
 
3565 
 
 
 
 
 
 
 
 
 
 
 
 
 
 probability of 'sell', that's what you should instruct your computer to do.  
 
3566 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
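
Since C<classify()> returns per-class probabilities, turning this soft answer into a
hard buy/sell decision is just a matter of picking the label with the largest
probability.  The following minimal sketch (assuming the C<$dt>, C<$root_node>, and
C<@data> objects created above, and skipping the special C<solution_path> entry that
is described later in this document) shows one way to do that:

    # A sketch only; assumes $dt, $root_node, and @data from above:
    my $classification = $dt->classify($root_node, \@data);
    # Keep only the class => probability pairs:
    my %probs = map  { ($_ => $classification->{$_}) }
                grep { $_ ne 'solution_path' } keys %$classification;
    # Pick the label with the highest probability:
    my ($decision) = sort { $probs{$b} <=> $probs{$a} } keys %probs;
    print "decision: $decision (probability = $probs{$decision})\n";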
 

The chances are that, on the average, this approach would beat the performance of any
of your individual traders who worked for you previously since the buy/sell decisions
made by the computer would be based on the collective wisdom of all your previous
traders.  B<Obviously, the dynamics of real-world trading are far more complex than
what is captured by the silly little example here. However, it does nicely convey the
sense in which the current module could be used.>
 

=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES

A feature is symbolic when its values are compared using string comparison operators.
By the same token, a feature is numeric when its values are compared using numeric
comparison operators.  Having said that, features that take only a small number of
numeric values in the training data can be treated symbolically provided you are
careful about handling their values in the test data.  At the least, you have to set
the test data value for such a feature to its closest value in the training data.
The module does that automatically for you for those numeric features for which the
number of different numeric values is less than a user-specified threshold.  For those
numeric features that the module is allowed to treat symbolically, this snapping of
the values of the features in the test data to the small set of values in the training
data is carried out automatically by the module.  That is, after a user has told the
module which numeric features to treat symbolically, the user need not worry about
how the feature values appear in the test data.
 

The constructor parameter C<symbolic_to_numeric_cardinality_threshold> lets you tell
the module when to consider an otherwise numeric feature symbolically. If you set
this parameter to 10, all numeric-looking features that take 10 or fewer different
values in the training datafile will be considered to be symbolic features by the
module.  See the tutorial at L<...> for further information on the implementation
issues related to the symbolic and numeric features.  A minimal constructor sketch
that sets this threshold is shown below.
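
The following is a minimal sketch of such a constructor call (the training file name
and the column indices are placeholders, not files or settings shipped with the
module):

    # A sketch only; file name and column indices are placeholders:
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile => "my_training_data.csv",
                 csv_class_column_index => 1,
                 csv_columns_for_features => [2,3,4,5],
                 # treat numeric features with <= 10 distinct values symbolically:
                 symbolic_to_numeric_cardinality_threshold => 10,
             );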
 

=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES

For the purpose of estimating the probabilities, it is necessary to sample the range
of values taken on by a numerical feature. For features with "nice" statistical
properties, this sampling interval is set to the median of the differences between
the successive feature values in the training data.  (Obviously, as you would expect,
you first sort all the values for a feature before computing the successive
differences.)  This logic will not work for the sort of feature described below.

Consider a feature whose values are heavy-tailed, and, at the same time, the values
span a million to one range.  What I mean by heavy-tailed is that rare values can
occur with significant probabilities.  It could happen that most of the values for
such a feature are clustered at one of the two ends of the range. At the same time,
there may exist a significant number of values near the end of the range that is less
populated.  (Typically, features related to human economic activities --- such as
wealth, incomes, etc. --- are of this type.)  With the logic described in the
previous paragraph, you could end up with a sampling interval that is much too small,
which could result in millions of sampling points for the feature if you are not
careful.
 

Beginning with Version 2.22, you have two options in dealing with such features.  You
can choose to go with the default behavior of the module, which is to sample the
value range for such a feature over a maximum of 500 points.  Or, you can supply an
additional option to the constructor that sets a user-defined value for the number of
points to use.  The name of the option is C<number_of_histogram_bins>.  The following
script

    construct_dt_for_heavytailed.pl

in the C<Examples> directory shows an example of how to call the constructor of the
module with the C<number_of_histogram_bins> option.
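
A minimal sketch of such a constructor call follows (the file name, column indices,
and bin count are placeholders):

    # A sketch only; file name, column indices, and bin count are placeholders:
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile => "heavytailed_data.csv",
                 csv_class_column_index => 1,
                 csv_columns_for_features => [2,3],
                 # cap the sampling of the value range at 100 points:
                 number_of_histogram_bins => 100,
             );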
 

=head1 TESTING THE QUALITY OF YOUR TRAINING DATA

Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
training data to test its ability to discriminate between the classes mentioned in
the training file.

The 10-fold cross-validation test divides all of the training data into ten parts,
with nine parts used for training a decision tree and one part used for testing its
ability to classify correctly. This selection of nine parts for training and one part
for testing is carried out in all of the ten different possible ways.

The following code fragment illustrates how you invoke the testing function of the
C<EvalTrainingData> class:

    my $training_datafile = "training.csv";
    my $eval_data = EvalTrainingData->new(
                                  training_datafile => $training_datafile,
                                  csv_class_column_index => 1,
                                  csv_columns_for_features => [2,3],
                                  entropy_threshold => 0.01,
                                  max_depth_desired => 3,
                                  symbolic_to_numeric_cardinality_threshold => 10,
                                  csv_cleanup_needed => 1,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data();
 

The last statement above prints out a Confusion Matrix and the value of the Training
Data Quality Index on a scale of 0 to 100, with 100 designating perfect training
data.  The Confusion Matrix shows how the different classes were mislabeled in the
10-fold cross-validation test.

This testing functionality can also be used to find the best values to use for the
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>, as illustrated by the sketch shown
below.
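
One simple way to search for good parameter values is to rerun the 10-fold test over
a grid of candidate settings and compare the quality indices printed for each run.
The following sketch (the candidate depths and the file name are placeholders) varies
just C<max_depth_desired>:

    # A sketch of a parameter sweep; candidate values are placeholders:
    foreach my $depth (2, 3, 4, 5) {
        print "\n=== max_depth_desired = $depth ===\n";
        my $eval_data = EvalTrainingData->new(
                            training_datafile => "training.csv",
                            csv_class_column_index => 1,
                            csv_columns_for_features => [2,3],
                            entropy_threshold => 0.01,
                            max_depth_desired => $depth,
                            symbolic_to_numeric_cardinality_threshold => 10,
                            csv_cleanup_needed => 1,
                        );
        $eval_data->get_training_data();
        $eval_data->evaluate_training_data();   # prints the quality index
    }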
 

The following two scripts in the C<Examples> directory illustrate the use of the
C<EvalTrainingData> class for testing the quality of your data:

    evaluate_training_data1.pl
    evaluate_training_data2.pl
 

=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS

Assuming your training data is good, the quality of the results you get from a
decision tree would depend on the choices you make for the constructor parameters
C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.  You can optimize your choices for
these parameters by running the 10-fold cross-validation test that is made available
in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
in the module file.  A description of how to run this test is in the previous section
of this document.
 

=head1 DECISION TREE INTROSPECTION

Starting with Version 2.30, you can ask the C<DTIntrospection> class of the module to
explain the classification decisions made at the different nodes of the decision
tree.

Perhaps the most important bit of information you are likely to seek through DT
introspection is the list of the training samples that fall directly in the portion
of the feature space that is assigned to a node.
 

However, note that, when training samples are non-uniformly distributed in the
underlying feature space, it is possible for a node to exist even when there are no
training samples in the portion of the feature space assigned to the node.  That is
because the decision tree is constructed from the probability densities estimated
from the training data.  When the training samples are non-uniformly distributed, it
is entirely possible for the estimated probability densities to be non-zero in a
small region around a point even when there are no training samples specifically in
that region.  (After you have created a statistical model for, say, the height
distribution of people in a community, the model may return a non-zero probability
for the height values in a small interval even if the community does not include a
single individual whose height falls in that interval.)

That a decision-tree node can exist even when there are no training samples in that
portion of the feature space that belongs to the node is an important indication of
the generalization ability of a decision-tree-based classifier.

In light of the explanation provided above, before the C<DTIntrospection> class
supplies any answers at all, it asks you to accept the fact that features can take on
non-zero probabilities at a point in the feature space even though there are zero
training samples at that point (or in a small region around that point).  If you do
not accept this rudimentary fact, the introspection class will not yield any answers
(since you are not going to believe the answers anyway).
 

The point made above implies that the path leading to a node in the decision tree may
test a feature for a certain value or threshold despite the fact that the portion of
the feature space assigned to that node is devoid of any training data.

See the following three scripts in the C<Examples> directory for how to carry out DT
introspection:

    introspection_in_a_loop_interactive.pl

    introspection_show_training_samples_at_all_nodes_direct_influence.pl

    introspection_show_training_samples_to_nodes_influence_propagation.pl
 

The first script places you in an interactive session in which you will first be
asked for the node number you are interested in.  Subsequently, you will be asked
whether or not you are interested in specific questions that the introspection can
provide answers for. The second script descends down the decision tree and shows for
each node the training samples that fall directly in the portion of the feature space
assigned to that node.  The third script shows for each training sample how it
affects the decision-tree nodes either directly or indirectly through the
generalization achieved by the probabilistic modeling of the data.

The output of the script
C<introspection_show_training_samples_at_all_nodes_direct_influence.pl> looks like:
 

    Node 0: the samples are: None
    Node 1: the samples are: [sample_46 sample_58]
    Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
    Node 3: the samples are: []
    Node 4: the samples are: []
    ...
    ...
 

The nodes for which no samples are listed come into existence through
the generalization achieved by the probabilistic modeling of the data.

The output produced by the script
C<introspection_show_training_samples_to_nodes_influence_propagation.pl> looks like:
 

    sample_1:
       nodes affected directly: [2 5 19 23]
       nodes affected through probabilistic generalization:
            2=> [3 4 25]
                25=> [26]
            5=> [6]
                6=> [7 13]
                    7=> [8 11]
                        8=> [9 10]
                        11=> [12]
                    13=> [14 18]
                        14=> [15 16]
                            16=> [17]
            19=> [20]
                20=> [21 22]
            23=> [24]

    sample_4:
       nodes affected directly: [2 5 6 7 11]
       nodes affected through probabilistic generalization:
            2=> [3 4 25]
                25=> [26]
            5=> [19]
                19=> [20 23]
                    20=> [21 22]
                    23=> [24]
            6=> [13]
                13=> [14 18]
                    14=> [15 16]
                        16=> [17]
            7=> [8]
                8=> [9 10]
            11=> [12]

    ...
    ...
    ...
 

For each training sample, the display shown above first presents the list of nodes
that are directly affected by the sample.  A node is affected directly by a sample if
the latter falls in the portion of the feature space that belongs to the former.
Subsequently, for each training sample, the display shows a subtree of the nodes that
are affected indirectly by the sample through the generalization achieved by the
probabilistic modeling of the data.  In general, a node is affected indirectly by a
sample if it is a descendant of another node that is affected directly.

Also see the section titled B<THE INTROSPECTION API> regarding how to invoke the
introspection capabilities of the module in your own code.
 

=head1 METHODS

The module provides the following methods for constructing a decision tree from
training data in a disk file and for classifying new data records with the decision
tree thus constructed:
 

=over 4

=item B<new():>

    my $dt = Algorithm::DecisionTree->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 2,
                              csv_columns_for_features => [3,4,5,6,7,8],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              csv_cleanup_needed => 1,
            );

A call to C<new()> constructs a new instance of the C<Algorithm::DecisionTree> class.
For this call to make sense, the training data in the training datafile must be
in the CSV format.
 

=back

=head2 The Constructor Parameters

=over 8

=item C<training_datafile>:
 

This parameter supplies the name of the file that contains the training data.

=item C<csv_class_column_index>:

When using a CSV file for your training data, this parameter supplies the zero-based
column index for the column that contains the class label for each data record in the
training file.

=item C<csv_cleanup_needed>:

You need to set this parameter to 1 if your CSV file has double-quoted strings as
values for the fields and if such values are allowed to include commas for,
presumably, better readability.

=item C<csv_columns_for_features>:

When using a CSV file for your training data, this parameter supplies a list of
columns corresponding to the features you wish to use for decision tree construction.
Each column is specified by its zero-based index.

=item C<entropy_threshold>:

This parameter sets the granularity with which the entropies are sampled by the
module.  For example, a feature test at a node in the decision tree is acceptable if
the entropy gain achieved by the test exceeds this threshold.  The larger the value
you choose for this parameter, the smaller the tree.  Its default value is 0.01.

=item C<max_depth_desired>:

This parameter sets the maximum depth of the decision tree.  For obvious reasons, the
smaller the value you choose for this parameter, the smaller the tree.

=item C<symbolic_to_numeric_cardinality_threshold>:

This parameter allows the module to treat an otherwise numeric feature symbolically
if the number of different values the feature takes in the training data file does
not exceed the value of this parameter.

=item C<number_of_histogram_bins>:

This parameter gives the user the option to set the number of points at which the
value range for a feature should be sampled for estimating the probabilities.  This
parameter is effective only for those features that occupy a large value range and
whose probability distributions are heavy tailed.  B<This parameter is also important
when you have a very large training dataset:> In general, the larger the dataset, the
smaller the smallest difference between any two values for a numeric feature in
relation to the overall range of values for that feature. In such cases, the module
may use too large a number of bins for estimating the probabilities and that may slow
down the calculation of the decision tree.  You can get around this difficulty by
explicitly giving a value to the C<number_of_histogram_bins> parameter.

=back
 
3895 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3896 
 
 
 
 
 
 
 
 
 
 
 
 
 
 You can choose the best values to use for the last three constructor parameters by  
 
3897 
 
 
 
 
 
 
 
 
 
 
 
 
 
 running a 10-fold cross-validation test on your training data through the class  
 
3898 
 
 
 
 
 
 
 
 
 
 
 
 
 
 C that comes with Versions 2.1 and higher of this module.  See the   
 
3899 
 
 
 
 
 
 
 
 
 
 
 
 
 
 section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.  
 
3900 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
3901 
 
 
 
 
 
 
 
 
 
 
 
 
 
 =over  
 
3902 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 

=item B<get_training_data():>

After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
you must now read in the training data that is in the file named in the call to the
constructor.  This you do by:

    $dt->get_training_data();
 

=item B<show_training_data():>

If you wish to see the training data that was just digested by the module,
call

    $dt->show_training_data();
 

=item B<calculate_first_order_probabilities():>

=item B<calculate_class_priors():>

After the module has read the training data file, it needs to initialize the
probability cache.  This you do by invoking:

    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
 

=item B<construct_decision_tree_classifier():>

With the probability cache initialized, it is time to construct a decision tree
classifier.  This you do by

    my $root_node = $dt->construct_decision_tree_classifier();

This call returns an instance of type C<DTNode>.  The C<DTNode> class is defined
within the main package file.  So don't forget that C<$root_node> in the above
example call will be instantiated to an object of type C<DTNode>.
 

=item B<$root_nodeC<< -> >>display_decision_tree(" "):>

    $root_node->display_decision_tree("   ");

This will display the decision tree in your terminal window by using a recursively
determined offset for each node as the display routine descends down the tree.

I have intentionally left the syntax fragment C<$root_node> in the above call to
remind the reader that C<display_decision_tree()> is NOT called on the instance of
the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
the call to C<construct_decision_tree_classifier()>.
 

=item B<classify($root_node, \@test_sample):>

Let's say you want to classify the following data record:

    my @test_sample  = qw /  g2=4.2
                             grade=2.3
                             gleason=4
                             eet=1.7
                             age=55.0
                             ploidy=diploid /;

you'd make the following call:

    my $classification = $dt->classify($root_node, \@test_sample);

where, again, C<$root_node> is an instance of type C<DTNode> returned by the call to
C<construct_decision_tree_classifier()>.  The variable C<$classification> holds a
reference to a hash whose keys are the class names and whose values the associated
probabilities.  The hash that is returned by the above call also includes a special
key-value pair for a key named C<solution_path>.  The value associated with this key
is an anonymous array that holds the path, in the form of a list of nodes, from the
root node to the leaf node in the decision tree where the final classification was
made.  A short sketch of how you might unpack this hash is shown below.
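
Here is a minimal sketch (assuming the C<$classification> hash reference from the
call above; the key name C<solution_path> is the one documented in the preceding
paragraph) of how you might separate the solution path from the per-class
probabilities:

    # A sketch only; assumes $classification from the call shown above:
    my $path = delete $classification->{solution_path};  # ref to array of nodes
    print "solution path: @$path\n";
    # What remains are class => probability pairs; print them sorted:
    foreach my $class (sort { $classification->{$b} <=> $classification->{$a} }
                       keys %$classification) {
        print "$class  =>  $classification->{$class}\n";
    }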
 

=item B<classify_by_asking_questions($root_node):>

This method allows you to use a decision-tree based classifier in an interactive
mode.  In this mode, a user is prompted for answers to the questions pertaining to
the feature tests at the nodes of the tree.  The syntax for invoking this method is:

    my $classification = $dt->classify_by_asking_questions($root_node);

where C<$dt> is an instance of the C<Algorithm::DecisionTree> class returned by a
call to C<new()> and C<$root_node> the root node of the decision tree returned by a
call to C<construct_decision_tree_classifier()>.

=back
 

=head1 THE INTROSPECTION API

To construct an instance of C<DTIntrospection>, you call

    my $introspector = DTIntrospection->new($dt);

where you supply the instance of the C<DecisionTree> class you used for constructing
the decision tree through the parameter C<$dt>.  After you have constructed an
instance of the introspection class, you must initialize it by

    $introspector->initialize();

Subsequently, you can invoke either of the following methods:
 

    $introspector->explain_classification_at_one_node($node);

    $introspector->explain_classifications_at_multiple_nodes_interactively();

depending on whether you want introspection at a single specified node or inside an
infinite loop for an arbitrary number of nodes.

If you want to output a tabular display that shows for each node in the decision tree
all the training samples that fall in the portion of the feature space that belongs
to that node, call

    $introspector->display_training_samples_at_all_nodes_direct_influence_only();

If you want to output a tabular display that shows for each training sample a list of
all the nodes that are affected directly AND indirectly by that sample, call

    $introspector->display_training_samples_to_nodes_influence_propagation();

A training sample affects a node directly if the sample falls in the portion of the
feature space assigned to that node. On the other hand, a training sample is
considered to affect a node indirectly if the node is a descendant of a node that is
affected directly by the sample.
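
Putting those calls together, a minimal end-to-end introspection sketch (assuming the
C<$dt> object from the METHODS section, with its decision tree already constructed)
might look like:

    # A sketch only; assumes $dt with its decision tree already constructed:
    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();
    # Per-node view: the samples that fall directly in each node's region:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();
    # Per-sample view: the nodes each sample affects, directly and indirectly:
    $introspector->display_training_samples_to_nodes_influence_propagation();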
 

=head1 BULK CLASSIFICATION OF DATA RECORDS

For large test datasets, you would obviously want to process an entire file of test
data at a time. The following script in the C<Examples> directory illustrates how you
can do that:

      classify_test_data_in_a_file.pl

This script requires three command-line arguments: the first names the training
datafile, the second the test datafile, and the third the file in which the
classification results are to be deposited.
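
A hypothetical invocation (the three file names below are placeholders for your own
files) would look like:

    perl classify_test_data_in_a_file.pl training.csv testdata.csv out.csv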
 

The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
bulk classification of data records when you wish to take advantage of bagging,
boosting, or tree randomization.  In their respective directories, these scripts are
named:

    bagging_for_bulk_classification.pl
    boosting_for_bulk_classification.pl
    classify_database_records.pl
 

=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED

It depends on whether you apply the classifier at once to all the data samples in a
file, or whether you feed one data sample at a time into the classifier.

In general, the classifier returns soft classification for a test data vector.  What
that means is that, in general, the classifier will list all the classes to which a
given data vector could belong and the probability of each such class label for the
data vector. Run the example scripts in the C<Examples> directory to see how the
output of classification can be displayed.

With regard to the soft classifications returned by this classifier, if the
probability distributions for the different classes overlap in the underlying feature
space, you would want the classifier to return all of the applicable class labels for
a data vector along with the corresponding class probabilities.  Another reason why
the decision tree classifier may associate significant probabilities with multiple
class labels is that you used an inadequate number of training samples to induce the
decision tree.  The good thing is that the classifier does not lie to you (unlike,
say, a hard classification rule that would return a single class label corresponding
to the partitioning of the underlying feature space).  The decision tree classifier
gives you the best classification that can be made given the training data you fed
into it.
 

=head1 USING BAGGING

Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
comes with the module to incorporate bagging in your decision tree based
classification.  Bagging means constructing multiple decision trees for different
(possibly overlapping) segments of the data extracted from your training dataset and
then aggregating the decisions made by the individual decision trees for the final
classification.  The aggregation of the classification decisions can average out the
noise and bias that may otherwise affect the classification decision obtained from
just one tree.
 

=over 4

=item B<new():>

A typical call to the constructor for the C<DecisionTreeWithBagging> class looks
like:

    use Algorithm::DecisionTreeWithBagging;

    my $training_datafile = "stage3cancer.csv";

    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                                  training_datafile => $training_datafile,
                                  csv_class_column_index => 2,
                                  csv_columns_for_features => [3,4,5,6,7,8],
                                  entropy_threshold => 0.01,
                                  max_depth_desired => 8,
                                  symbolic_to_numeric_cardinality_threshold => 10,
                                  how_many_bags => 4,
                                  bag_overlap_fraction => 0.2,
                                  csv_cleanup_needed => 1,
                 );
 

Note in particular the following two constructor parameters:

    how_many_bags

    bag_overlap_fraction

where, as the name implies, the parameter C<how_many_bags> controls how many bags
(and, therefore, how many decision trees) will be constructed from your training
dataset; and where the parameter C<bag_overlap_fraction> controls the degree of
overlap between the bags.  To understand what exactly is achieved by setting the
parameter C<bag_overlap_fraction> to 0.2 in the above example, let's say that the
non-overlapping partitioning of the training data between the bags results in 100
training samples per bag. With C<bag_overlap_fraction> set to 0.2, an additional 20
samples drawn randomly from the other bags will be added to the data in each bag.
(A quick sketch of this arithmetic appears right after this list.)

=back
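
As a sanity check on that arithmetic, the following sketch (the total sample count is
an illustrative assumption) computes the effective size of each bag:

    # Illustrative arithmetic only; the total sample count is assumed:
    my $total_samples        = 400;                                # assumption
    my $how_many_bags        = 4;
    my $bag_overlap_fraction = 0.2;
    my $per_bag  = int($total_samples / $how_many_bags);           # 100 samples
    my $overlap  = int($per_bag * $bag_overlap_fraction);          # 20 extra
    print "each bag holds ", $per_bag + $overlap, " samples\n";    # 120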
 

=head2 B<Methods defined for the C<DecisionTreeWithBagging> class>

=over 8

=item B<get_training_data_for_bagging():>

This method reads your training datafile, randomizes it, and then partitions it into
the specified number of bags.  Subsequently, if the constructor parameter
C<bag_overlap_fraction> is non-zero, it adds to each bag additional samples drawn at
random from the other bags.  The number of these additional samples added to each bag
is controlled by the constructor parameter C<bag_overlap_fraction>.  If this
parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
drawn from the other bags.

=item B<show_training_data_in_bags():>

Shows for each bag the names of the training data samples in that bag.

=item B<calculate_first_order_probabilities():>

Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
first-order probabilities from the data samples in each bag.

=item B<calculate_class_priors():>

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
class priors for the data samples in each bag.

=item B<construct_decision_trees_for_bags():>

Calls on the appropriate method of the main C<DecisionTree> class to construct a
decision tree from the training data in each bag.

=item B<display_decision_trees_for_bags():>

Displays separately the decision tree constructed for each bag.

=item B<classify_with_bagging( test_sample ):>

Calls on the appropriate methods of the main C<DecisionTree> class to classify the
argument test sample.

=item B<display_classification_results_for_each_bag():>

Displays separately the classification decision made by the decision tree
constructed for each bag.

=item B<get_majority_vote_classification():>

Using majority voting, this method aggregates the classification decisions made by
the individual decision trees into a single decision.

=back

See the example scripts in the directory C<ExamplesBagging> for how to call these
methods for classifying individual samples and for bulk classification when you place
all your test samples in a single file.
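
Stringing those methods together, a minimal bagging workflow sketch (the test sample
below is a placeholder record; treat the exact call sequence as an assumption modeled
on the method descriptions above) might look like:

    # A sketch only; the data file and test sample are placeholders:
    use Algorithm::DecisionTreeWithBagging;
    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                    training_datafile => "stage3cancer.csv",
                    csv_class_column_index => 2,
                    csv_columns_for_features => [3,4,5,6,7,8],
                    how_many_bags => 4,
                    bag_overlap_fraction => 0.2,
                    csv_cleanup_needed => 1,
                );
    $dtbag->get_training_data_for_bagging();
    $dtbag->calculate_first_order_probabilities();
    $dtbag->calculate_class_priors();
    $dtbag->construct_decision_trees_for_bags();
    my @test_sample = qw / g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;
    $dtbag->classify_with_bagging(\@test_sample);
    $dtbag->display_classification_results_for_each_bag();
    # Aggregate the per-bag decisions into a single majority-vote decision:
    $dtbag->get_majority_vote_classification();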
 

=head1 USING BOOSTING

Starting with Version 3.20, you can use the class C<BoostedDecisionTree> for
constructing a boosted decision-tree classifier.  Boosting results in a cascade of
decision trees in which each decision tree is constructed with samples that are
mostly those that are misclassified by the previous decision tree.  To be precise,
you create a probability distribution over the training samples for the selection of
samples for training each decision tree in the cascade.  To start out, the
distribution is uniform over all of the samples. Subsequently, this probability
distribution changes according to the misclassifications by each tree in the cascade:
if a sample is misclassified by a given tree in the cascade, the probability of its
being selected for training the next tree is increased significantly.  You also
associate a trust factor with each decision tree depending on its power to classify
correctly all of the training data samples.  After a cascade of decision trees is
constructed in this manner, you construct a final classifier that calculates the
class label for a test data sample by taking into account the classification
decisions made by each individual tree in the cascade, the decisions being weighted
by the trust factors associated with the individual classifiers.  These boosting
notions --- generally referred to as the AdaBoost algorithm --- are based on a now
celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
the Proceedings of the 2nd European Conf. on Computational Learning Theory.  For a
tutorial introduction to AdaBoost, see L<...>

Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
boosting apply only to the case of binary classification.  Additionally, your
training dataset must capture all of the significant statistical variations in the
classes represented therein.
 
4215 
 
 
 
 
 
 
 
 
 
 
 
 
 
 =over 4  
 
4216 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
4217 
 
 
 
 
 
 
 
 
 
 
 
 
 
 =item B   
 
4218 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
4219 
 
 
 
 
 
 
 
 
 
 
 
 
 
 If you'd like to experiment with boosting, a typical call to the constructor for the  
 
4220 
 
 
 
 
 
 
 
 
 
 
 
 
 
 C class looks like:    
 
4221 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
4222 
 
 
 
 
 
 
 
 
 
 
 
 
 
     use Algorithm::BoostedDecisionTree;  
 
4223 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $training_datafile = "training6.csv";  
 
4224 
 
 
 
 
 
 
 
 
 
 
 
 
 
     my $boosted = Algorithm::BoostedDecisionTree->new(  
 
4225 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               training_datafile => $training_datafile,  
 
4226 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               csv_class_column_index => 1,  
 
4227 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               csv_columns_for_features => [2,3],  
 
4228 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               entropy_threshold => 0.01,  
 
4229 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               max_depth_desired => 8,  
 
4230 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               symbolic_to_numeric_cardinality_threshold => 10,  
 
4231 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               how_many_stages => 4,  
 
4232 
 
 
 
 
 
 
 
 
 
 
 
 
 
                               csv_cleanup_needed => 1,  
 
4233 
 
 
 
 
 
 
 
 
 
 
 
 
 
                   );  
 
4234 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
4235 
 
 
 
 
 
 
 
 
 
 
 
 
 
 Note in particular the constructor parameter:  
 
4236 
 
 
 
 
 
 
 
 
 
 
 
 
 
       
 
4237 
 
 
 
 
 
 
 
 
 
 
 
 
 
     how_many_stages  
 
4238 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
4239 
 
 
 
 
 
 
 
 
 
 
 
 
 
 As its name implies, this parameter controls how many stages will be used in the  
 
4240 
 
 
 
 
 
 
 
 
 
 
 
 
 
 boosted decision tree classifier.  As mentioned above, a separate decision tree is  
 
4241 
 
 
 
 
 
 
 
 
 
 
 
 
 
 constructed for each stage of boosting using a set of training samples that are drawn  
 
4242 
 
 
 
 
 
 
 
 
 
 
 
 
 
 through a probability distribution maintained over the entire training dataset.  
 
4243 
 
 
 
 
 
 
 
 
 
 
 
 
 
    
 
4244 
 
 
 
 
 
 
 
 
 
 
 
 
 
 =back  
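
Here is the generic sketch of weighted sample drawing promised above --- a
roulette-wheel style draw of sample indices according to a probability distribution.
This is only an illustration of the idea and not the module's internal sampling
code; all names in it are invented for the sketch:

    use strict;
    use warnings;

    # Selection probabilities over six training samples (must sum to 1):
    my @probs = (0.125, 0.25, 0.125, 0.25, 0.125, 0.125);

    # Draw one sample index according to @probs:
    sub draw_index {
        my $r   = rand();
        my $cum = 0;
        for my $i (0 .. $#probs) {
            $cum += $probs[$i];
            return $i if $r <= $cum;
        }
        return $#probs;    # guard against floating-point round-off
    }

    # Training set (indices, drawn with replacement) for the next tree:
    my @drawn = map { draw_index() } 1 .. 10;
    print "drawn sample indices: @drawn\n";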
 
=head2 B<Methods of the C<Algorithm::BoostedDecisionTree> class>

=over 8

=item B

This method reads your training datafile and creates the data structures from the
data ingested for constructing the base decision tree.

=item B

Writes to the standard output the training data samples and also some relevant
properties of the features used in the training dataset.

=item B

Calls on the appropriate methods of the main C<Algorithm::DecisionTree> class to
estimate the first-order probabilities and the class priors.

=item B

Calls on the appropriate method of the main C<Algorithm::DecisionTree> class to
construct the base decision tree.

=item B

Displays the base decision tree in your terminal window. (The textual form of the
decision tree is written out to the standard output.)

=item B

Uses the AdaBoost algorithm to construct a cascade of decision trees.  As mentioned
earlier, the training samples for each tree in the cascade are drawn using a
probability distribution over the entire training dataset. This probability
distribution for any given tree in the cascade is heavily influenced by which
training samples are misclassified by the previous tree.

=item B

Displays separately in your terminal window the decision tree constructed for each
stage of the cascade. (The textual form of the trees is written out to the standard
output.)

=item B

Calls on each decision tree in the cascade to classify the argument C<$test_sample>.

=item B

You can call this method to display in your terminal window the classification
decision made by each decision tree in the cascade.  The method also prints out the
trust factor associated with each decision tree.  It is important to look
simultaneously at the classification decision and the trust factor for each tree ---
since a classification decision made by a specific tree may appear bizarre for a
given test sample.  This method is useful primarily for debugging purposes.

=item B

As with the previous method, this method is useful mostly for debugging. It returns
class labels for the samples misclassified by the stage whose integer index is
supplied as an argument to the method.  Say you have 10 stages in your cascade.  The
value of this argument would go from 0 to 9, with 0 corresponding to the base tree.

=item B

Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
classification decisions made by the individual trees while taking into account the
trust factors associated with the trees.  As mentioned earlier, we associate with
each tree of the cascade a trust factor that depends on the overall misclassification
rate associated with that tree.  (A toy illustration of this trust-weighted pooling
appears just after this method list.)

=back
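
Here is the toy illustration of trust-weighted pooling promised above.  The labels
and the trust factors are made up, and this is only a picture of the idea --- not
the formula exactly as coded in the module:

    use strict;
    use warnings;

    # One classification decision per stage of the cascade, along with the
    # trust factor earned by that stage's tree (all values made up):
    my @stage_decisions = ('class_a', 'class_b', 'class_a', 'class_b');
    my @trust_factors   = ( 0.90,      0.35,      0.60,      0.20 );

    my %weighted_votes;
    $weighted_votes{ $stage_decisions[$_] } += $trust_factors[$_]
                                                   for 0 .. $#stage_decisions;
    my ($winner) = sort { $weighted_votes{$b} <=> $weighted_votes{$a} }
                        keys %weighted_votes;
    print "final classification: $winner\n";   # class_a: 1.50 beats 0.55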
 
See the example scripts in the C<ExamplesBoosting> subdirectory for how to call the
methods listed above for classifying individual data samples with boosting and for
bulk classification when you place all your test samples in a single file.

=head1 USING RANDOMIZED DECISION TREES

Consider the following two situations that call for using randomized decision trees,
meaning multiple decision trees that are trained using data extracted randomly from a
large database of training samples:

(1) Consider a two-class problem for which the training database is grossly
imbalanced in how many majority-class samples it contains vis-a-vis the number of
minority class samples.  Let's assume for a moment that the ratio of majority class
samples to minority class samples is 1000 to 1.  Let's also assume that you have a
test dataset that is drawn randomly from the same population mixture from which the
training database was created.  Now consider a stupid data classification program
that classifies everything as belonging to the majority class.  If you measure the
classification accuracy rate as the ratio of the number of samples correctly
classified to the total number of test samples selected randomly from the population,
this classifier would work with an accuracy of 99.9% (it errs only on the one sample
in every 1001 that belongs to the minority class).

(2) Let's now consider another situation in which we are faced with a huge training
database but in which every class is equally well represented.  Feeding all the data
into a single decision tree would be akin to polling all of the population of the
United States for measuring the Coke-versus-Pepsi preference in the country.  You are
likely to get better results if you construct multiple decision trees, each trained
with a collection of training samples drawn randomly from the training database.
After you have created all the decision trees, your final classification decision
could then be based on, say, majority voting by the trees.
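
Majority voting of this sort amounts to nothing more than tallying the per-tree
decisions, as in this small self-contained Perl illustration (not code from the
module):

    use strict;
    use warnings;

    # One decision per tree in the ensemble:
    my @votes = ('coke', 'pepsi', 'coke', 'coke', 'pepsi');
    my %tally;
    $tally{$_}++ for @votes;
    my ($majority) = sort { $tally{$b} <=> $tally{$a} } keys %tally;
    print "ensemble decision: $majority\n";    # prints "coke"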
 
In summary, the C<Algorithm::RandomizedTreesForBigData> class allows you to solve the
following two problems: (1) data classification using the needle-in-a-haystack
metaphor, that is, when a vast majority of your training samples belong to just one
class; and (2) data classification when you have access to a very large database of
training samples and you wish to construct an ensemble of decision trees.

=over 4

=item B

Here is how you'd call the C<Algorithm::RandomizedTreesForBigData> constructor for
needle-in-a-haystack classification:

    use Algorithm::RandomizedTreesForBigData;
    my $training_datafile = "your_database.csv";
    my $rt = Algorithm::RandomizedTreesForBigData->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 48,
                              csv_columns_for_features => [24,32,33,34,41],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_trees => 5,
                              looking_for_needles_in_haystack => 1,
                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    looking_for_needles_in_haystack
    how_many_trees

The first of these parameters, C<looking_for_needles_in_haystack>, invokes the logic
for constructing an ensemble of decision trees, each based on a training dataset that
uses all of the minority class samples, and a random drawing from the majority class
samples.
 
Here is how you'd call the C<Algorithm::RandomizedTreesForBigData> constructor for a
more general attempt at constructing an ensemble of decision trees, with each tree
trained with randomly drawn samples from a large database of training data (without
paying attention to the differences in the sizes of the populations for the
different classes):

    use Algorithm::RandomizedTreesForBigData;
    my $training_datafile = "your_database.csv";
    my $rt = Algorithm::RandomizedTreesForBigData->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 2,
                              csv_columns_for_features => [3,4,5,6,7,8],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_trees => 3,
                              how_many_training_samples_per_tree => 50,
                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    how_many_training_samples_per_tree
    how_many_trees

When you set the C<how_many_training_samples_per_tree> parameter, you are not allowed
to also set the C<looking_for_needles_in_haystack> parameter, and vice versa.

=back

=head2 B<Methods of the C<Algorithm::RandomizedTreesForBigData> class>

=over 8

=item B

What this method does depends on which of the two constructor parameters ---
C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
set.  When the former is set, it creates a collection of training datasets for
C<how_many_trees> decision trees, with each dataset being a mixture of the minority
class and samples drawn randomly from the majority class.  However, when the latter
option is set, all the datasets are drawn randomly from the training database with no
particular attention given to the relative populations of the two classes.
 
=item B

As the name implies, this method shows the training data being used for all the
decision trees.  This method is useful for debugging purposes using small datasets.

=item B

Calls on the appropriate method of the main C<Algorithm::DecisionTree> class to
estimate the first-order probabilities for the training dataset to be used for each
decision tree.

=item B

Calls on the appropriate method of the main C<Algorithm::DecisionTree> class to
estimate the class priors for the training dataset to be used for each decision
tree.

=item B

Calls on the appropriate method of the main C<Algorithm::DecisionTree> class to
construct the decision trees.

=item B

Displays all the decision trees in your terminal window. (The textual form of the
decision trees is written out to the standard output.)

=item B

The C<$test_sample> argument is sent to each decision tree for classification.

=item B

The classification decisions returned by the individual decision trees are written
out to the standard output.

=item B

This method aggregates the classification results returned by the individual
decision trees and returns the majority decision.

=back

=head1 CONSTRUCTING REGRESSION TREES
 
Decision tree based modeling requires that the class labels be distinct.  That is,
the training dataset must contain a relatively small number of discrete class labels
for all of your data records if you want to model the data with one or more decision
trees.  However, when one is trying to understand all of the associational
relationships that exist in a large database, one often runs into situations where,
instead of discrete class labels, you have a continuously valued variable as a
dependent variable whose values are predicated on a set of feature values.  It is for
such situations that you will find useful the new class C<RegressionTree> that is now
a part of the C<Algorithm::DecisionTree> module.  The C<RegressionTree> class has
been programmed as a subclass of the main C<Algorithm::DecisionTree> class.

You can think of regression with a regression tree as a powerful generalization of
the very commonly used Linear Regression algorithms.  Although you can certainly
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
modeling nonlinearities between the predictor variables and the dependent variable,
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
can inject continuities between the predictor and the predicted variables that may
not really exist in the real data.  Regression trees, on the other hand, give you a
piecewise linear relationship between the predictor and the predicted variables that
is freed from the constraints of superimposed continuities at the joins between the
different segments.  See the following tutorial for further information regarding the
standard linear regression approach and the regression that can be achieved with the
RegressionTree class in this module:

L

The RegressionTree class in the current version of the module assumes that all of
your data is numerical.  That is, unlike what is possible with the DecisionTree class
(and the other more closely related classes in this module) that allow your training
file to contain a mixture of numerical and symbolic data, the RegressionTree class
requires that ALL of your data be numerical.  I hope to relax this constraint in
future versions of this module.  Obviously, the dependent variable will always be
numerical for regression.

See the example scripts in the directory C<ExamplesRegression> if you wish to become
more familiar with the regression capabilities of the module.
 
=over 4

=item B

    my $training_datafile = "gendata5.csv";
    my $rt = Algorithm::RegressionTree->new(
                              training_datafile => $training_datafile,
                              dependent_variable_column => 2,
                              predictor_columns => [1],
                              mse_threshold => 0.01,
                              max_depth_desired => 2,
                              jacobian_choice => 0,
                              csv_cleanup_needed => 1,
             );

Note in particular the constructor parameters:

    dependent_variable_column
    predictor_columns
    mse_threshold
    jacobian_choice

The first of these parameters, C<dependent_variable_column>, is set to the column
index in the CSV file for the dependent variable.  The second constructor parameter,
C<predictor_columns>, tells the system which columns contain values for the
predictor variables.  The third parameter, C<mse_threshold>, is for deciding when to
partition the data at a node into two child nodes as a regression tree is being
constructed.  If the minmax of MSE (Mean Squared Error) that can be achieved by
partitioning any of the features at a node is smaller than C<mse_threshold>, that
node becomes a leaf node of the regression tree.  (A toy computation of such a
per-node MSE appears just after this list.)

The last parameter, C<jacobian_choice>, must be set to either 0, 1, or 2.  Its
default value is 0. When this parameter equals 0, the regression coefficients are
calculated using the linear least-squares method and no further "refinement" of the
coefficients is carried out using gradient descent.  This is the fastest way to
calculate the regression coefficients.  When C<jacobian_choice> is set to 1, you get
a weak version of gradient descent in which the Jacobian is set to the "design
matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
approximation to the Jacobian.  That, however, is at a cost of much longer
computation time.  For most cases, using 0 for C<jacobian_choice> is the best
choice.  See the tutorial mentioned earlier for why that is the case.

=back
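
Here is the toy per-node MSE computation promised above.  The module gives you a
piecewise linear fit, but for brevity this sketch measures the error against a
constant (mean) fit, so treat it purely as an illustration of the C<mse_threshold>
test and not as the module's own code:

    use strict;
    use warnings;
    use List::Util qw(sum);

    # Dependent-variable values for the training samples at one node:
    my @y    = (1.02, 0.97, 1.10, 0.95);
    my $mean = sum(@y) / @y;                                  # constant fit
    my $mse  = sum(map { ($_ - $mean) ** 2 } @y) / @y;
    my $mse_threshold = 0.01;
    print "node MSE = $mse\n";
    print "make this node a leaf\n" if $mse < $mse_threshold;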
 
=head2 B<Methods of the C<Algorithm::RegressionTree> class>

=over 8

=item B

Only CSV training datafiles are allowed. Additionally, the first record in the file
must list the names of the fields, and the first column must contain an integer ID
for each record.

=item B

As the name implies, this is the method that constructs a regression tree.

=item B

Displays the regression tree, as the name implies.  The white-space string argument
specifies the offset to use in displaying the child nodes in relation to a parent
node.

=item B

You call this method after you have constructed a regression tree if you want to
calculate the prediction for one sample.  The parameter C<$root_node> is what is
returned by the call that constructs the regression tree.  The formatting of the
argument bound to the C<$test_sample> parameter is important.  To elaborate, let's
say you are using two variables named C<$xvar1> and C<$xvar2> as your predictor
variables. In this case, the C<$test_sample> parameter will be bound to a list that
will look like

    ['xvar1 = 23.4', 'xvar2 = 12.9']

Arbitrary amount of white space, including none, on the two sides of the equality
symbol is allowed in the construct shown above.  A call to this method returns a
hash with two key-value pairs: the value for one of the keys is the path in the
regression tree to the leaf node that yielded the prediction, and the value for the
other key is the predicted value itself --- the answer you are looking for.  (A
hypothetical usage sketch appears just after this method list.)

=item B

This call calculates the predictions for all of the predictor-variable data in your
training file.  The parameter C<$root_node> is what is returned by the call that
constructs the regression tree.  The values thus predicted for the dependent
variable can be seen by calling the display method described in the next entry.

=item B

This method displays the results obtained by calling the prediction method of the
previous entry.  This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file with a fixed name chosen by the module.

=item B

This method carries out an error analysis of the predictions for the samples in your
training datafile.  It shows you the overall MSE (Mean Squared Error) with tree-based
regression, the MSE for the data samples at each of the leaf nodes of the regression
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
The parameter C<$root_node> in the call syntax is what is returned by the call that
constructs the regression tree.

=item B

Call this method if you want to apply the regression tree to all your test data in a
disk file.  The predictions for all of the test samples in the disk file are written
out to another file whose name is the same as that of the test file except for the
addition of C<_output> in the name of the file.  The parameter C<$filename> is the
name of the disk file that contains the test data. And the parameter C<$columns> is a
list of the column indices for the predictor variables in the test file.

=back
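
Putting the above together, a single-sample prediction might look like the sketch
below, with C<$rt> as constructed in the example earlier.  Be warned that this is a
hypothetical illustration: the method names C<construct_regression_tree()> and
C<prediction_for_single_data_point()>, and the hash key C<prediction>, are
assumptions made for this sketch --- use the names documented by your installed
version of the module:

    # HYPOTHETICAL sketch --- the method names and the hash key below are
    # assumed for illustration, not quoted from the module's documented API:
    my $root_node = $rt->construct_regression_tree();
    my $answer    = $rt->prediction_for_single_data_point($root_node,
                                      ['xvar1 = 23.4', 'xvar2 = 12.9']);
    print "prediction: $answer->{prediction}\n";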
 
=head1 GENERATING SYNTHETIC TRAINING DATA

The module file contains the following additional classes: (1)
C<TrainingDataGeneratorNumeric>, and (2) C<TrainingDataGeneratorSymbolic> for
generating synthetic training data.

The class C<TrainingDataGeneratorNumeric> outputs one CSV file for the training data
and another one for the test data for experimenting with numeric features.  The
numeric values are generated using a multivariate Gaussian distribution whose mean
and covariance are specified in a parameter file.  See the sample parameter file for
the numeric case in the C<Examples> directory for an example of such a file.  Note
that the dimensionality of the data is inferred from the information you place in
the parameter file.

The class C<TrainingDataGeneratorSymbolic> generates synthetic training data for the
purely symbolic case.  The relative frequencies of the different possible values for
the features are controlled by the biasing information you place in a parameter
file.  See the sample parameter file for the symbolic case in the C<Examples>
directory for an example of such a file.
 
=head1 THE C<Examples> DIRECTORY

See the C<Examples> directory in the distribution for how to construct a decision
tree, and how to then classify new data using the decision tree.  To become more
familiar with the module, run the scripts

    construct_dt_and_classify_one_sample_case1.pl
    construct_dt_and_classify_one_sample_case2.pl
    construct_dt_and_classify_one_sample_case3.pl
    construct_dt_and_classify_one_sample_case4.pl

The first script is for the purely symbolic case, the second for the case that
involves both numeric and symbolic features, the third for the case of purely numeric
features, and the last for the case when the training data is synthetically generated
by one of the data-generation scripts mentioned below.

Next run the following script as it is for bulk classification of data records placed
in a CSV file:

    classify_test_data_in_a_file.pl   training4.csv   test4.csv   out4.csv

The script first constructs a decision tree using the training data in the training
file supplied by the first argument file C<training4.csv>.  The script then
calculates the class label for each data record in the test data file supplied
through the second argument file, C<test4.csv>.  The estimated class labels are
written out to the output file which in the call shown above is C<out4.csv>.  An
important thing to note here is that your test file --- in this case C<test4.csv> ---
must have a column for class labels.  Obviously, in real-life situations, there will
be no class labels in this column.  When that is the case, you can place an empty
string C<""> there for each data record. This is demonstrated by the following call:

    classify_test_data_in_a_file.pl   training4.csv   test4_no_class_labels.csv   out4.csv

The following script in the C<Examples> directory

    classify_by_asking_questions.pl

shows how you can use a decision-tree classifier interactively.  In this mode, the
decision tree is first constructed from the training data and then the user is
prompted for answers to the feature tests at the nodes of the tree.
 
If your training data has a feature whose values span a large range and, at the same
time, are characterized by a heavy-tail distribution, you should look at the script

    construct_dt_for_heavytailed.pl

to see how to use the option C<number_of_histogram_bins> in the call to the
constructor.  This option was introduced in Version 2.22 for dealing with such
features.  If you do not set this option, the module will use the default value of
500 for the number of points at which to sample the value range for such a feature.

The C<Examples> directory also contains the following scripts:

    generate_training_data_numeric.pl
    generate_training_data_symbolic.pl

that show how you can use the module to generate synthetic training data.  Synthetic
training data is generated according to the specifications laid out in a parameter
file.  There are constraints on how the information is laid out in a parameter file.
See the sample parameter files in the C<Examples> directory for how to structure
these files.

The C<Examples> directory of Versions 2.1 and higher of the module also contains the
following two scripts:

    evaluate_training_data1.pl
    evaluate_training_data2.pl

that illustrate how the Perl class C<EvalTrainingData> can be used to evaluate the
quality of your training data (as long as it resides in a C<.csv> file).  This new
class is a subclass of the C<DecisionTree> class in the module file.  See the README
in the C<Examples> directory for further information regarding these two scripts.

The C<Examples> directory of Versions 2.31 and higher of the module contains the
following three scripts:

    introspection_in_a_loop_interactive.pl

    introspection_show_training_samples_at_all_nodes_direct_influence.pl

    introspection_show_training_samples_to_nodes_influence_propagation.pl

The first script illustrates how to use the C<DTIntrospection> class of the module
interactively for generating explanations for the classification decisions made at
the nodes of the decision tree.  In the interactive session you are first asked for
the node number you are interested in.  Subsequently, you are asked whether or not
you are interested in specific questions that the introspector can provide answers
for. The second script generates a tabular display that shows for each node of the
decision tree a list of the training samples that fall directly in the portion of
the feature space assigned to that node.  (As mentioned elsewhere in this
documentation, when this list is empty for a node, that means the node is a result of
the generalization achieved by probabilistic modeling of the data.  Note that this
module constructs a decision tree NOT by partitioning the set of training samples,
BUT by partitioning the domains of the probability density functions.)  The third
script listed above also generates a tabular display, but one that shows how the
influence of each training sample propagates in the tree.  This display first shows
the list of nodes that are affected directly by the data in a training sample. This
list is followed by an indented display of the nodes that are affected indirectly by
the training sample.  A training sample affects a node indirectly if the node is a
descendant of one of the nodes affected directly.

The latest addition to the C<Examples> directory is the script:

    get_indexes_associated_with_fields.py

As to why you may find this script useful, note that large database files may have
hundreds of fields and it is not always easy to figure out what numerical index is
associated with a given field.  At the same time, the constructor of the DecisionTree
module requires that the field that holds the class label and the fields that contain
the feature values be specified by their numerical zero-based indexes.  If you have a
large database and you are faced with this problem, you can run this script to see
the zero-based numerical index values associated with the different columns of your
CSV file.
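
The heart of such a script is nothing more than reading the header record of the
CSV file and printing each field name alongside its zero-based position.  Here is a
minimal stand-in that assumes a simple comma-separated header with no quoted fields
(the filename is just a placeholder):

    use strict;
    use warnings;

    my $csv_file = "your_database.csv";            # placeholder filename
    open my $fh, '<', $csv_file or die "cannot open $csv_file: $!";
    chomp( my $header = <$fh> );
    my @fields = split /,/, $header;
    printf "%4d  %s\n", $_, $fields[$_] for 0 .. $#fields;
    close $fh;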
 
=head1 THE C<ExamplesBagging> DIRECTORY

The C<ExamplesBagging> directory contains the following scripts:

    bagging_for_classifying_one_test_sample.pl

    bagging_for_bulk_classification.pl

As the names of the scripts imply, the first shows how to call the different methods
of the C<Algorithm::DecisionTreeWithBagging> class for classifying a single test
sample.  When you are classifying a single test sample, you can also see how each
bag is classifying the test sample.  You can, for example, display the training data
used in each bag, the decision tree constructed for each bag, etc.

The second script is for the case when you place all of the test samples in a single
file.  The demonstration script displays for each test sample a single aggregate
classification decision that is obtained through majority voting by all the decision
trees.

=head1 THE C<ExamplesBoosting> DIRECTORY

The C<ExamplesBoosting> subdirectory in the main installation directory contains the
following three scripts:

    boosting_for_classifying_one_test_sample_1.pl

    boosting_for_classifying_one_test_sample_2.pl

    boosting_for_bulk_classification.pl

As the names of the first two scripts imply, these show how to call the different
methods of the C<Algorithm::BoostedDecisionTree> class for classifying a single test
sample.  When you are classifying a single test sample, you can see how each stage
of the cascade of decision trees is classifying the test sample.  You can also view
each decision tree separately and also see the trust factor associated with the
tree.

The third script is for the case when you place all of the test samples in a single
file.  The demonstration script outputs for each test sample a single aggregate
classification decision that is obtained through trust-factor weighted majority
voting by all the decision trees.
 
=head1 THE C<ExamplesRandomizedTrees> DIRECTORY

The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
become more familiar with the C<Algorithm::RandomizedTreesForBigData> class for
solving needle-in-a-haystack and big-data classification problems. These scripts
are:

    randomized_trees_for_classifying_one_test_sample_1.pl

    randomized_trees_for_classifying_one_test_sample_2.pl

    classify_database_records.pl

The first script shows the constructor options to use for solving a
needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
training data belongs to just one class.  The second script shows the constructor
options for using randomized decision trees for the case when you have access to a
very large database of training samples and you'd like to construct an ensemble of
decision trees using training samples pulled randomly from the training database.
The last script illustrates how you can evaluate the classification power of an
ensemble of decision trees as constructed by C<RandomizedTreesForBigData> by
classifying a large number of test samples extracted randomly from the training
database.

=head1 THE C<ExamplesRegression> DIRECTORY

The C<ExamplesRegression> subdirectory in the main installation directory shows
example scripts that you can use to become familiar with regression trees and how
they can be used for nonlinear regression.  If you are new to the concept of
regression trees, start by executing the following scripts without changing them and
see what sort of output is produced by them:

    regression4.pl

    regression5.pl

    regression6.pl

    regression8.pl

The C<regression4.pl> script involves only one predictor variable and one dependent
variable.  The training data for this exercise, supplied in the same directory, is
strongly nonlinear.  When you run the script C<regression4.pl>, you will see how
much better the result from tree regression is compared to what you can get with
linear regression.

The C<regression5.pl> script is essentially the same as the previous script except
for the fact that the training datafile used in this case consists of three noisy
segments, as opposed to just two in the previous case.

The script C<regression6.pl> deals with the case when we have two predictor
variables and one dependent variable.  You can think of the data as consisting of
noisy height values over an C<(x1,x2)> plane.  The data used in this script is drawn
from a CSV file supplied in the same directory.

Finally, the script C<regression8.pl> shows how you can carry out bulk prediction
for all your test data records in a disk file.  The script writes all the calculated
predictions into another disk file whose name is derived from the name of the test
data file.
 
=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'DecisionTree' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-DecisionTree-3.43.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-DecisionTree-3.43>.  Enter this directory and execute the following
commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on.  In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

Another option for a non-standard install would be to place at the top of your own
scripts the following invocation:

    BEGIN {
        unshift @INC, "/some/other/directory/blib/lib";
    }
 
=head1 THANKS

I wish to thank many users of this module for their feedback.  Many of the
improvements I have made to the module over the years are a result of the feedback
received.

I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x.  For
Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
used for testing the module. Slaven's feedback in the form of the Bug report
C<#96547> resulted in Version 2.23 of the module.  Version 2.25 further downshifts
the required version of Perl to 5.10.

On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version 2.27,
I am removing the Perl version restriction altogether from Version 2.30.  Thanks,
Slaven!

Version 3.43 was prompted by a bug report sent by Jan Trukenmüller regarding a
missing backslash in a regex in the C<cleanup_csv()> function.  Thanks, Jan!

=head1 AUTHOR

The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
with the publication of the book I<Designing with Objects> by John-Wiley. If
interested, visit his web page at Purdue to find out what this project was all
about. You might like I<Designing with Objects> especially if you enjoyed reading
Harry Potter as a kid (or even as an adult, for that matter).

If you send email regarding this module, please place the string "DecisionTree" in
your subject line to get past my spam filter.  Avi Kak's email address is
C<kak@purdue.edu>

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

 Copyright 2017 Avinash Kak

=cut