line
stmt
bran
cond
sub
pod
time
code
1
package Algorithm::DecisionTree;
2
3
#--------------------------------------------------------------------------------------
4
# Copyright (c) 2016 Avinash Kak. All rights reserved. This program is free
5
# software. You may modify and/or distribute it under the same terms as Perl itself.
6
# This copyright notice must remain attached to the file.
7
#
8
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
9
# multidimensional data.
10
# -------------------------------------------------------------------------------------
11
12
#use 5.10.0;
13
1
1
12964
use strict;
1
2
1
24
14
1
1
3
use warnings;
1
1
1
20
15
1
1
4
use Carp;
1
3
1
12093
16
17
our $VERSION = '3.42';
18
19
############################################ Constructor ##############################################
20
sub new {
21
1
1
1
330
my ($class, %args, $eval_or_boosting_mode);
22
1
50
12
if (@_ % 2 != 0) {
23
1
5
($class, %args) = @_;
24
} else {
25
0
0
$class = shift;
26
0
0
$eval_or_boosting_mode = shift;
27
0
0
0
0
die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
28
0
0
0
die "Only one string arg allowed in eval and boosting modes" if @_;
29
}
30
1
50
3
unless ($eval_or_boosting_mode) {
31
1
3
my @params = keys %args;
32
1
50
3
croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
33
if check_for_illegal_params2(@params) == 0;
34
}
35
bless {
36
_training_datafile => $args{training_datafile},
37
_entropy_threshold => $args{entropy_threshold} || 0.01,
38
_max_depth_desired => exists $args{max_depth_desired} ?
39
$args{max_depth_desired} : undef,
40
_debug1 => $args{debug1} || 0,
41
_debug2 => $args{debug2} || 0,
42
_debug3 => $args{debug3} || 0,
43
_csv_class_column_index => $args{csv_class_column_index} || undef,
44
_csv_columns_for_features => $args{csv_columns_for_features} || undef,
45
_symbolic_to_numeric_cardinality_threshold
46
=> $args{symbolic_to_numeric_cardinality_threshold} || 10,
47
_number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
48
1
50
50
53
_csv_cleanup_needed => $args{csv_cleanup_needed} || 0,
50
50
50
50
50
50
50
50
49
_training_data => [],
50
_root_node => undef,
51
_probability_cache => {},
52
_entropy_cache => {},
53
_training_data_hash => {},
54
_features_and_values_hash => {},
55
_samples_class_label_hash => {},
56
_class_names => [],
57
_class_priors => [],
58
_class_priors_hash => {},
59
_feature_names => [],
60
_numeric_features_valuerange_hash => {},
61
_sampling_points_for_numeric_feature_hash => {},
62
_feature_values_how_many_uniques_hash => {},
63
_prob_distribution_numeric_features_hash => {},
64
_histogram_delta_hash => {},
65
_num_of_histogram_bins_hash => {},
66
}, $class;
67
68
}
69
70
#################################### Classify with Decision Tree #######################################
71
72
## Classifies one test sample at a time using the decision tree constructed from
73
## your training file. The data record for the test sample must be supplied as
74
## shown in the scripts in the `examples' subdirectory. See the scripts
75
## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
76
sub classify {
77
0
0
1
0
my $self = shift;
78
0
0
my $root_node = shift;
79
0
0
my $feature_and_values = shift;
80
0
0
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
81
0
0
my @features_and_values = @$feature_and_values;
82
0
0
@features_and_values = @{deep_copy_array(\@features_and_values)};
0
0
83
0
0
0
die "\n\nError in the names you have used for features and/or values. " .
84
"Try using the csv_cleanup_needed option in the constructor call."
85
unless $self->check_names_used(\@features_and_values);
86
0
0
my @new_features_and_values = ();
87
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
88
0
0
foreach my $feature_and_value (@features_and_values) {
89
0
0
$feature_and_value =~ /$pattern/;
90
0
0
my ($feature, $value) = ($1, $2);
91
0
0
my $newvalue = $value;
92
0
0
my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
0
0
93
0
0
my $not_all_values_float = 0;
94
0
0
0
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
0
0
95
0
0
0
0
if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
0
0
96
$not_all_values_float == 0) {
97
0
0
$newvalue = closest_sampling_point($value, \@unique_values_for_feature);
98
}
99
0
0
push @new_features_and_values, "$feature" . '=' . "$newvalue";
100
}
101
0
0
@features_and_values = @new_features_and_values;
102
0
0
0
print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
103
0
0
my %answer = ();
104
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
105
0
0
$answer{$class_name} = undef;
106
}
107
0
0
$answer{'solution_path'} = [];
108
0
0
my %classification = %{$self->recursive_descent_for_classification($root_node,
0
0
109
\@features_and_values,\%answer)};
110
0
0
@{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
0
0
0
0
111
0
0
0
if ($self->{_debug3}) {
112
0
0
print "\nCL2 The classification:\n";
113
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
114
0
0
print " $class_name with probability $classification{$class_name}\n";
115
}
116
}
117
0
0
my %classification_for_display = ();
118
0
0
foreach my $item (keys %classification) {
119
0
0
0
if ($item ne 'solution_path') {
120
0
0
$classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
121
} else {
122
0
0
my @outlist = ();
123
0
0
foreach my $x (@{$classification{$item}}) {
0
0
124
0
0
push @outlist, "NODE$x";
125
}
126
0
0
$classification_for_display{$item} = \@outlist;
127
}
128
}
129
0
0
return \%classification_for_display;
130
}
131
132
sub recursive_descent_for_classification {
133
0
0
0
0
my $self = shift;
134
0
0
my $node = shift;
135
0
0
my $features_and_values = shift;
136
0
0
my $answer = shift;
137
0
0
my @features_and_values = @$features_and_values;
138
0
0
my %answer = %$answer;
139
0
0
my @children = @{$node->get_children()};
0
0
140
0
0
0
if (@children == 0) {
141
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
142
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
143
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
144
}
145
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
146
0
0
return \%answer;
147
}
148
0
0
my $feature_tested_at_node = $node->get_feature();
149
print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
150
0
0
0
if $self->{_debug3};
151
0
0
my $value_for_feature;
152
my $path_found;
153
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
154
0
0
foreach my $feature_and_value (@features_and_values) {
155
0
0
$feature_and_value =~ /$pattern/;
156
0
0
0
$value_for_feature = $2 if $feature_tested_at_node eq $1;
157
}
158
# The following clause introduced in Version 3.20
159
0
0
0
if (!defined $value_for_feature) {
160
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
161
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
162
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
163
}
164
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
165
0
0
return \%answer;
166
}
167
0
0
0
if ($value_for_feature) {
168
0
0
0
if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
0
0
169
0
0
0
print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
170
0
0
my $pattern1 = '(.+)<(.+)';
171
0
0
my $pattern2 = '(.+)>(.+)';
172
0
0
foreach my $child (@children) {
173
0
0
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
0
0
174
0
0
my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
175
0
0
0
if ($last_feature_and_value_on_branch =~ /$pattern1/) {
176
0
0
my ($feature, $threshold) = ($1,$2);
177
0
0
0
if ($value_for_feature <= $threshold) {
178
0
0
$path_found = 1;
179
0
0
%answer = %{$self->recursive_descent_for_classification($child,
0
0
180
$features_and_values,\%answer)};
181
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
182
0
0
last;
183
}
184
}
185
0
0
0
if ($last_feature_and_value_on_branch =~ /$pattern2/) {
186
0
0
my ($feature, $threshold) = ($1,$2);
187
0
0
0
if ($value_for_feature > $threshold) {
188
0
0
$path_found = 1;
189
0
0
%answer = %{$self->recursive_descent_for_classification($child,
0
0
190
$features_and_values,\%answer)};
191
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
192
0
0
last;
193
}
194
}
195
}
196
0
0
0
return \%answer if $path_found;
197
} else {
198
0
0
my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
199
print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
200
0
0
0
if $self->{_debug3};
201
0
0
foreach my $child (@children) {
202
0
0
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
0
0
203
0
0
0
print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
204
0
0
my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
205
0
0
0
if ($last_feature_and_value_on_branch eq $feature_value_combo) {
206
0
0
%answer = %{$self->recursive_descent_for_classification($child,
0
0
207
$features_and_values,\%answer)};
208
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
209
0
0
$path_found = 1;
210
0
0
last;
211
}
212
}
213
0
0
0
return \%answer if $path_found;
214
}
215
}
216
0
0
0
if (! $path_found) {
217
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
218
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
219
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
220
}
221
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
222
}
223
0
0
return \%answer;
224
}
225
226
## If you want classification to be carried out by engaging a human user in a
227
## question-answer session, this is the method to use for that purpose. See, for
228
## example, the script classify_by_asking_questions.pl in the `examples'
229
## subdirectory for an illustration of how to do that.
230
sub classify_by_asking_questions {
231
0
0
1
0
my $self = shift;
232
0
0
my $root_node = shift;
233
0
0
my %answer = ();
234
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
235
0
0
$answer{$class_name} = undef;
236
}
237
0
0
$answer{'solution_path'} = [];
238
0
0
my %scratchpad_for_numeric_answers = ();
239
0
0
foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
0
0
240
0
0
$scratchpad_for_numeric_answers{$feature_name} = undef;
241
}
242
0
0
my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
0
0
243
\%answer, \%scratchpad_for_numeric_answers)};
244
0
0
@{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
0
0
0
0
245
0
0
my %classification_for_display = ();
246
0
0
foreach my $item (keys %classification) {
247
0
0
0
if ($item ne 'solution_path') {
248
0
0
$classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
249
} else {
250
0
0
my @outlist = ();
251
0
0
foreach my $x (@{$classification{$item}}) {
0
0
252
0
0
push @outlist, "NODE$x";
253
}
254
0
0
$classification_for_display{$item} = \@outlist;
255
}
256
}
257
0
0
return \%classification_for_display;
258
}
259
260
sub interactive_recursive_descent_for_classification {
261
0
0
0
0
my $self = shift;
262
0
0
my $node = shift;
263
0
0
my $answer = shift;
264
0
0
my $scratchpad_for_numerics = shift;
265
0
0
my %answer = %$answer;
266
0
0
my %scratchpad_for_numerics = %$scratchpad_for_numerics;
267
0
0
my $pattern1 = '(.+)<(.+)';
268
0
0
my $pattern2 = '(.+)>(.+)';
269
0
0
my $user_value_for_feature;
270
0
0
my @children = @{$node->get_children()};
0
0
271
0
0
0
if (@children == 0) {
272
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
273
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
274
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
275
}
276
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
277
0
0
return \%answer;
278
}
279
0
0
my @list_of_branch_attributes_to_children = ();
280
0
0
foreach my $child (@children) {
281
0
0
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
0
0
282
0
0
my $feature_and_value_on_branch = $branch_features_and_values[-1];
283
0
0
push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
284
}
285
0
0
my $feature_tested_at_node = $node->get_feature();
286
0
0
my $feature_value_combo;
287
0
0
my $path_found = 0;
288
0
0
0
if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
0
0
289
0
0
0
if ($scratchpad_for_numerics{$feature_tested_at_node}) {
290
0
0
$user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
291
} else {
292
0
0
my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
0
0
293
0
0
while (1) {
294
0
0
print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
295
0
0
print "\nEnter a value in the range (@valuerange): ";
296
0
0
$user_value_for_feature = ;
297
0
0
$user_value_for_feature =~ s/\r?\n?$//;
298
0
0
$user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
299
0
0
my $answer_found = 0;
300
0
0
0
0
if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
301
0
0
$answer_found = 1;
302
0
0
last;
303
}
304
0
0
0
last if $answer_found;
305
0
0
print("You entered illegal value. Let's try again")
306
}
307
0
0
$scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
308
}
309
0
0
foreach my $i (0..@list_of_branch_attributes_to_children-1) {
310
0
0
my $branch_attribute = $list_of_branch_attributes_to_children[$i];
311
0
0
0
if ($branch_attribute =~ /$pattern1/) {
312
0
0
my ($feature,$threshold) = ($1,$2);
313
0
0
0
if ($user_value_for_feature <= $threshold) {
314
0
0
%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
0
0
315
\%answer, \%scratchpad_for_numerics)};
316
0
0
$path_found = 1;
317
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
318
0
0
last;
319
}
320
}
321
0
0
0
if ($branch_attribute =~ /$pattern2/) {
322
0
0
my ($feature,$threshold) = ($1,$2);
323
0
0
0
if ($user_value_for_feature > $threshold) {
324
0
0
%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
0
0
325
\%answer, \%scratchpad_for_numerics)};
326
0
0
$path_found = 1;
327
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
328
0
0
last;
329
}
330
}
331
}
332
0
0
0
return \%answer if $path_found;
333
} else {
334
0
0
my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
0
0
335
0
0
while (1) {
336
0
0
print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
337
0
0
print "\nEnter a value from the list (@possible_values_for_feature): ";
338
0
0
$user_value_for_feature = ;
339
0
0
$user_value_for_feature =~ s/\r?\n?$//;
340
0
0
$user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
341
0
0
my $answer_found = 0;
342
0
0
0
if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
343
0
0
$answer_found = 1;
344
0
0
last;
345
}
346
0
0
0
last if $answer_found;
347
0
0
print("You entered illegal value. Let's try again");
348
}
349
0
0
$feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
350
0
0
foreach my $i (0..@list_of_branch_attributes_to_children-1) {
351
0
0
my $branch_attribute = $list_of_branch_attributes_to_children[$i];
352
0
0
0
if ($branch_attribute eq $feature_value_combo) {
353
0
0
%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
0
0
354
\%answer, \%scratchpad_for_numerics)};
355
0
0
$path_found = 1;
356
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
357
0
0
last;
358
}
359
}
360
0
0
0
return \%answer if $path_found;
361
}
362
0
0
0
if (! $path_found) {
363
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
364
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
365
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
366
}
367
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
368
}
369
0
0
return \%answer;
370
}
371
372
###################################### Decision Tree Construction ####################################
373
374
## At the root node, we find the best feature that yields the greatest reduction in
375
## class entropy from the entropy based on just the class priors. The logic for
376
## finding this feature is different for symbolic features and for numeric features.
377
## That logic is built into the method shown later for best feature calculations.
378
sub construct_decision_tree_classifier {
379
1
1
1
8
print "\nConstructing the decision tree ...\n";
380
1
2
my $self = shift;
381
1
50
3
if ($self->{_debug3}) {
382
0
0
$self->determine_data_condition();
383
0
0
print "\nStarting construction of the decision tree:\n";
384
}
385
1
2
my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
2
5
1
2
386
1
50
4
if ($self->{_debug3}) {
387
0
0
print "\nPrior class probabilities: @class_probabilities\n";
388
0
0
print "\nClass names: @{$self->{_class_names}}\n";
0
0
389
}
390
1
4
my $entropy = $self->class_entropy_on_priors();
391
1
50
3
print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
392
1
11
my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
393
1
1
$root_node->set_class_names(\@{$self->{_class_names}});
1
4
394
1
1
$self->{_root_node} = $root_node;
395
1
3
$self->recursive_descent($root_node);
396
1
5
return $root_node;
397
}
398
399
## After the root node of the decision tree is calculated by the previous methods,
400
## we invoke this method recursively to create the rest of the tree. At each node,
401
## we find the feature that achieves the largest entropy reduction with regard to
402
## the partitioning of the training data samples that correspond to that node.
403
sub recursive_descent {
404
26
26
0
23
my $self = shift;
405
26
13
my $node = shift;
406
print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
407
26
50
44
if $self->{_debug3};
408
26
31
my $node_serial_number = $node->get_serial_num();
409
26
18
my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
26
29
410
26
27
my $existing_node_entropy = $node->get_node_entropy();
411
26
50
38
if ($self->{_debug3}) {
412
0
0
print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
413
0
0
print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
414
0
0
print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
415
0
0
my @class_probs = @{$node->get_class_probabilities()};
0
0
416
0
0
print "\nRD4 Class probabilities: @class_probs\n";
417
}
418
26
100
36
if ($existing_node_entropy < $self->{_entropy_threshold}) {
419
8
50
12
print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
420
8
23
return;
421
}
422
18
16
my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
18
22
423
18
35
my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
424
$self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
425
18
30
$node->set_feature($best_feature);
426
18
50
27
$node->display_node() if $self->{_debug3};
427
18
50
33
28
if (defined($self->{_max_depth_desired}) &&
428
(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
429
0
0
0
print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
430
0
0
return;
431
}
432
18
50
27
return if ! defined $best_feature;
433
18
50
21
if ($self->{_debug3}) {
434
0
0
print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
435
0
0
print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
436
0
0
print "\nRD9 Best feature entropy: $best_feature_entropy\n";
437
0
0
print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
438
}
439
18
19
my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
440
18
50
22
print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
441
18
100
25
if ($entropy_gain > $self->{_entropy_threshold}) {
442
12
50
33
23
if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
443
$self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
444
$self->{_symbolic_to_numeric_cardinality_threshold}) {
445
0
0
my $best_threshold = $decision_val; # as returned by best feature calculator
446
0
0
my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
447
my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
448
0
0
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
449
my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
450
0
0
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
451
0
0
my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
452
0
0
my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
453
0
0
push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
454
$feature_threshold_combo_for_less_than;
455
0
0
push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
456
$feature_threshold_combo_for_greater_than;
457
0
0
0
if ($self->{_debug3}) {
458
0
0
print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
459
"@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
460
0
0
print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
461
"@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
462
}
463
my @class_probabilities_for_lessthan_child_node =
464
0
0
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
465
0
0
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
0
0
466
my @class_probabilities_for_greaterthan_child_node =
467
0
0
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
468
0
0
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
0
0
469
0
0
0
if ($self->{_debug3}) {
470
0
0
print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
471
0
0
print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
472
}
473
0
0
0
if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
474
0
0
my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
475
\@class_probabilities_for_lessthan_child_node,
476
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
477
0
0
$node->add_child_link($left_child_node);
478
0
0
$self->recursive_descent($left_child_node);
479
}
480
0
0
0
if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
481
0
0
my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
482
\@class_probabilities_for_greaterthan_child_node,
483
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
484
0
0
$node->add_child_link($right_child_node);
485
0
0
$self->recursive_descent($right_child_node);
486
}
487
} else {
488
print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
489
12
50
15
if $self->{_debug3};
490
12
8
my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
12
22
491
12
50
19
print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
492
12
29
my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
40
75
493
12
13
my @class_entropies_for_children = ();
494
12
23
foreach my $feature_and_value_index (0..@feature_value_combos-1) {
495
print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
496
40
50
50
if $self->{_debug3};
497
40
32
my @extended_branch_features_and_values_or_thresholds;
498
40
100
45
if (! @features_and_values_or_thresholds_on_branch) {
499
4
7
@extended_branch_features_and_values_or_thresholds =
500
($feature_value_combos[$feature_and_value_index]);
501
} else {
502
@extended_branch_features_and_values_or_thresholds =
503
36
27
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
36
38
504
36
41
push @extended_branch_features_and_values_or_thresholds,
505
$feature_value_combos[$feature_and_value_index];
506
}
507
my @class_probabilities =
508
80
89
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
509
40
31
\@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
40
42
510
40
48
my $class_entropy_for_child =
511
$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
512
\@extended_branch_features_and_values_or_thresholds);
513
40
50
56
if ($self->{_debug3}) {
514
0
0
print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
515
0
0
print "\nRD20 class entropy for child: $class_entropy_for_child\n";
516
}
517
40
100
51
if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
518
25
42
my $child_node = DTNode->new(undef, $class_entropy_for_child,
519
\@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
520
25
38
$node->add_child_link($child_node);
521
25
78
$self->recursive_descent($child_node);
522
} else {
523
15
50
34
print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
524
}
525
}
526
}
527
} else {
528
print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
529
6
50
10
if $self->{_debug3};
530
6
22
return;
531
}
532
}
533
534
## This is the heart of the decision tree constructor. Its main job is to figure
535
## out the best feature to use for partitioning the training data samples that
536
## correspond to the current node. The search for the best feature is carried out
537
## differently for symbolic features and for numeric features. For a symbolic
538
## feature, the method estimates the entropy for each value of the feature and then
539
## averages out these entropies as a measure of the discriminatory power of that
540
## features. For a numeric feature, on the other hand, it estimates the entropy
541
## reduction that can be achieved if were to partition the set of training samples
542
## for each possible threshold. For a numeric feature, all possible sampling points
543
## relevant to the node in question are considered as candidates for thresholds.
544
sub best_feature_calculator {
545
18
18
0
18
my $self = shift;
546
18
8
my $features_and_values_or_thresholds_on_branch = shift;
547
18
13
my $existing_node_entropy = shift;
548
18
19
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
549
18
17
my $pattern1 = '(.+)=(.+)';
550
18
9
my $pattern2 = '(.+)<(.+)';
551
18
13
my $pattern3 = '(.+)>(.+)';
552
18
17
my @all_symbolic_features = ();
553
18
13
foreach my $feature_name (@{$self->{_feature_names}}) {
18
25
554
push @all_symbolic_features, $feature_name
555
72
50
111
if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
556
}
557
18
14
my @symbolic_features_already_used = ();
558
18
18
foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
559
47
50
197
push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
560
}
561
18
14
my @symbolic_features_not_yet_used;
562
18
14
foreach my $x (@all_symbolic_features) {
563
72
100
65
push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
564
}
565
18
15
my @true_numeric_types = ();
566
18
74
my @symbolic_types = ();
567
18
13
my @true_numeric_types_feature_names = ();
568
18
11
my @symbolic_types_feature_names = ();
569
18
17
foreach my $item (@features_and_values_or_thresholds_on_branch) {
570
47
50
146
if ($item =~ /$pattern2/) {
50
50
571
0
0
push @true_numeric_types, $item;
572
0
0
push @true_numeric_types_feature_names, $1;
573
} elsif ($item =~ /$pattern3/) {
574
0
0
push @true_numeric_types, $item;
575
0
0
push @true_numeric_types_feature_names, $1;
576
} elsif ($item =~ /$pattern1/) {
577
47
36
push @symbolic_types, $item;
578
47
66
push @symbolic_types_feature_names, $1;
579
} else {
580
0
0
die "format error in the representation of feature and values or thresholds";
581
}
582
}
583
18
24
my %seen = ();
584
18
0
15
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
0
0
585
18
27
%seen = ();
586
18
50
18
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
47
104
587
my @bounded_intervals_numeric_types =
588
18
11
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
18
33
589
# Calculate the upper and the lower bounds to be used when searching for the best
590
# threshold for each of the numeric features that are in play at the current node:
591
18
13
my (%upperbound, %lowerbound);
592
18
17
foreach my $feature (@true_numeric_types_feature_names) {
593
0
0
$upperbound{$feature} = undef;
594
0
0
$lowerbound{$feature} = undef;
595
}
596
18
17
foreach my $item (@bounded_intervals_numeric_types) {
597
0
0
foreach my $feature_grouping (@$item) {
598
0
0
0
if ($feature_grouping->[1] eq '>') {
599
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
600
} else {
601
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
602
}
603
}
604
}
605
18
15
my %entropy_values_for_different_features = ();
606
18
11
my %partitioning_point_child_entropies_hash = ();
607
18
15
my %partitioning_point_threshold = ();
608
18
16
my %entropies_for_different_values_of_symbolic_feature = ();
609
18
18
foreach my $feature (@{$self->{_feature_names}}) {
18
25
610
72
73
$entropy_values_for_different_features{$feature} = [];
611
72
59
$partitioning_point_child_entropies_hash{$feature} = {};
612
72
54
$partitioning_point_threshold{$feature} = undef;
613
72
74
$entropies_for_different_values_of_symbolic_feature{$feature} = [];
614
}
615
18
13
foreach my $i (0..@{$self->{_feature_names}}-1) {
18
51
616
72
62
my $feature_name = $self->{_feature_names}->[$i];
617
72
50
82
print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
618
72
100
33
82
if (contained_in($feature_name, @symbolic_features_already_used)) {
50
619
47
45
next;
620
25
44
} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
621
$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
622
$self->{_symbolic_to_numeric_cardinality_threshold}) {
623
0
0
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
624
0
0
0
print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
625
0
0
my @newvalues = ();
626
0
0
0
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
627
0
0
0
0
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
0
0
0
0
0
0
628
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
629
0
0
next;
630
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
631
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
632
0
0
foreach my $x (@values) {
633
0
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
634
}
635
} elsif (defined($upperbound{$feature_name})) {
636
0
0
foreach my $x (@values) {
637
0
0
0
push @newvalues, $x if $x <= $upperbound{$feature_name};
638
}
639
} elsif (defined($lowerbound{$feature_name})) {
640
0
0
foreach my $x (@values) {
641
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name};
642
}
643
} else {
644
0
0
die "Error is bound specifications in best feature calculator";
645
}
646
} else {
647
0
0
@newvalues = @{deep_copy_array(\@values)};
0
0
648
}
649
0
0
0
next if @newvalues == 0;
650
0
0
my @partitioning_entropies = ();
651
0
0
foreach my $value (@newvalues) {
652
0
0
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
653
0
0
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
654
0
0
my @for_left_child;
655
my @for_right_child;
656
0
0
0
if (@features_and_values_or_thresholds_on_branch) {
657
0
0
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
658
0
0
push @for_left_child, $feature_and_less_than_value_string;
659
0
0
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
660
0
0
push @for_right_child, $feature_and_greater_than_value_string;
661
} else {
662
0
0
@for_left_child = ($feature_and_less_than_value_string);
663
0
0
@for_right_child = ($feature_and_greater_than_value_string);
664
}
665
0
0
my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
666
\@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667
0
0
my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
668
\@features_and_values_or_thresholds_on_branch, $feature_name, $value);
669
0
0
my $partitioning_entropy = $entropy1 *
670
$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
671
$entropy2 *
672
$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
673
674
0
0
push @partitioning_entropies, $partitioning_entropy;
675
0
0
$partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
676
}
677
0
0
my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
678
0
0
0
if ($min_entropy < $existing_node_entropy) {
679
0
0
$partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
680
0
0
$entropy_values_for_different_features{$feature_name} = $min_entropy;
681
}
682
} else {
683
25
50
29
print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
684
25
50
30
print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
685
25
20
my %seen;
686
82
50
262
my @values = grep {$_ ne 'NA' && !$seen{$_}++}
687
25
16
@{$self->{_features_and_unique_values_hash}->{$feature_name}};
25
35
688
25
47
@values = sort @values;
689
25
50
31
print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
690
691
25
19
my $entropy = 0;
692
25
25
foreach my $value (@values) {
693
82
127
my $feature_value_string = "$feature_name" . '=' . "$value";
694
82
50
98
print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
695
82
59
my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
82
96
696
82
100
117
if (@features_and_values_or_thresholds_on_branch) {
697
68
63
push @extended_attributes, $feature_value_string;
698
} else {
699
14
14
@extended_attributes = ($feature_value_string);
700
}
701
82
100
$entropy +=
702
$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
703
$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
704
print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
705
82
50
118
"is $entropy\n" if $self->{_debug3};
706
82
62
push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
82
157
707
}
708
25
50
41
if ($entropy < $existing_node_entropy) {
709
25
64
$entropy_values_for_different_features{$feature_name} = $entropy;
710
}
711
}
712
}
713
18
16
my $min_entropy_for_best_feature;
714
my $best_feature_name;
715
18
30
foreach my $feature_nom (keys %entropy_values_for_different_features) {
716
72
100
73
if (!defined($best_feature_name)) {
717
18
14
$best_feature_name = $feature_nom;
718
18
20
$min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
719
} else {
720
54
100
86
if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
721
13
11
$best_feature_name = $feature_nom;
722
13
13
$min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
723
}
724
}
725
}
726
18
15
my $threshold_for_best_feature;
727
18
50
23
if (exists $partitioning_point_threshold{$best_feature_name}) {
728
18
16
$threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
729
} else {
730
0
0
$threshold_for_best_feature = undef;
731
}
732
18
10
my $best_feature_entropy = $min_entropy_for_best_feature;
733
18
15
my @val_based_entropies_to_be_returned;
734
my $decision_val_to_be_returned;
735
18
50
33
36
if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
736
$self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
737
$self->{_symbolic_to_numeric_cardinality_threshold}) {
738
@val_based_entropies_to_be_returned =
739
0
0
@{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
0
0
740
} else {
741
18
15
@val_based_entropies_to_be_returned = ();
742
}
743
18
50
26
if (exists $partitioning_point_threshold{$best_feature_name}) {
744
18
15
$decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
745
} else {
746
0
0
$decision_val_to_be_returned = undef;
747
}
748
print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
749
18
50
25
"@val_based_entropies_to_be_returned\n" if $self->{_debug3};
750
18
102
return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
751
$decision_val_to_be_returned);
752
}
753
754
######################################### Entropy Calculators #####################################
755
756
sub class_entropy_on_priors {
757
1
1
0
2
my $self = shift;
758
return $self->{_entropy_cache}->{'priors'}
759
1
50
4
if exists $self->{_entropy_cache}->{"priors"};
760
1
1
my @class_names = @{$self->{_class_names}};
1
2
761
1
2
my $entropy;
762
1
1
foreach my $class (@class_names) {
763
2
4
my $prob = $self->prior_probability_for_class($class);
764
2
50
33
18
my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
765
2
50
3
$log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
766
2
50
4
$log_prob = 0 if $prob > 0.999; # since log(1) = 0
767
2
100
5
if (!defined $entropy) {
768
1
2
$entropy = -1.0 * $prob * $log_prob;
769
1
2
next;
770
}
771
1
3
$entropy += -1.0 * $prob * $log_prob;
772
}
773
1
2
$self->{_entropy_cache}->{'priors'} = $entropy;
774
1
2
return $entropy;
775
}
776
777
sub entropy_scanner_for_a_numeric_feature {
778
0
0
0
0
local $| = 1;
779
0
0
my $self = shift;
780
0
0
my $feature = shift;
781
0
0
my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
0
0
782
0
0
my @entropies_for_less_than_thresholds = ();
783
0
0
my @entropies_for_greater_than_thresholds = ();
784
0
0
foreach my $point (@all_sampling_points) {
785
0
0
print ". ";
786
0
0
push @entropies_for_less_than_thresholds,
787
$self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
788
0
0
push @entropies_for_greater_than_thresholds,
789
$self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
790
}
791
0
0
print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
792
"@entropies_for_less_than_thresholds\n";
793
0
0
print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
794
"@entropies_for_greater_than_thresholds\n";
795
}
796
797
sub class_entropy_for_less_than_threshold_for_feature {
798
0
0
0
0
my $self = shift;
799
0
0
my $arr = shift;
800
0
0
my $feature = shift;
801
0
0
my $threshold = shift;
802
0
0
my @array_of_features_and_values_or_thresholds = @$arr;
803
0
0
my $feature_threshold_combo = "$feature" . '<' . "$threshold";
804
0
0
my $sequence = join ":", @array_of_features_and_values_or_thresholds;
805
0
0
$sequence .= ":" . $feature_threshold_combo;
806
0
0
0
return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
807
my @copy_of_array_of_features_and_values_or_thresholds =
808
0
0
@{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
0
0
809
0
0
push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
810
0
0
my $entropy = 0;
811
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
812
0
0
my $log_prob = undef;
813
0
0
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
814
$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
815
0
0
0
0
if ($prob >= 0.0001 && $prob <= 0.999) {
0
0
816
0
0
$log_prob = log($prob) / log(2.0);
817
} elsif ($prob < 0.0001) {
818
0
0
$log_prob = 0;
819
} elsif ($prob > 0.999) {
820
0
0
$log_prob = 0;
821
} else {
822
0
0
die "An error has occurred in log_prob calculation";
823
}
824
0
0
$entropy += -1.0 * $prob * $log_prob;
825
}
826
0
0
0
if (abs($entropy) < 0.0000001) {
827
0
0
$entropy = 0.0;
828
}
829
0
0
$self->{_entropy_cache}->{$sequence} = $entropy;
830
0
0
return $entropy;
831
}
832
833
sub class_entropy_for_greater_than_threshold_for_feature {
834
0
0
0
0
my $self = shift;
835
0
0
my $arr = shift;
836
0
0
my $feature = shift;
837
0
0
my $threshold = shift;
838
0
0
my @array_of_features_and_values_or_thresholds = @$arr;
839
0
0
my $feature_threshold_combo = "$feature" . '>' . "$threshold";
840
0
0
my $sequence = join ":", @array_of_features_and_values_or_thresholds;
841
0
0
$sequence .= ":" . $feature_threshold_combo;
842
0
0
0
return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
843
my @copy_of_array_of_features_and_values_or_thresholds =
844
0
0
@{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
0
0
845
0
0
push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
846
0
0
my $entropy = 0;
847
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
848
0
0
my $log_prob = undef;
849
0
0
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
850
$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
851
0
0
0
0
if ($prob >= 0.0001 && $prob <= 0.999) {
0
0
852
0
0
$log_prob = log($prob) / log(2.0);
853
} elsif ($prob < 0.0001) {
854
0
0
$log_prob = 0;
855
} elsif ($prob > 0.999) {
856
0
0
$log_prob = 0;
857
} else {
858
0
0
die "An error has occurred in log_prob calculation";
859
}
860
0
0
$entropy += -1.0 * $prob * $log_prob;
861
}
862
0
0
0
if (abs($entropy) < 0.0000001) {
863
0
0
$entropy = 0.0;
864
}
865
0
0
$self->{_entropy_cache}->{$sequence} = $entropy;
866
0
0
return $entropy;
867
}
868
869
sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
870
122
122
0
89
my $self = shift;
871
122
69
my $array_of_features_and_values_or_thresholds = shift;
872
122
124
my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
873
122
122
my $sequence = join ":", @array_of_features_and_values_or_thresholds;
874
122
100
229
return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
875
82
49
my $entropy = 0;
876
82
49
foreach my $class_name (@{$self->{_class_names}}) {
82
89
877
164
124
my $log_prob = undef;
878
164
192
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
879
$class_name, \@array_of_features_and_values_or_thresholds);
880
164
100
100
438
if ($prob >= 0.0001 && $prob <= 0.999) {
100
50
881
138
149
$log_prob = log($prob) / log(2.0);
882
} elsif ($prob < 0.0001) {
883
13
10
$log_prob = 0;
884
} elsif ($prob > 0.999) {
885
13
12
$log_prob = 0;
886
} else {
887
0
0
die "An error has occurred in log_prob calculation";
888
}
889
164
184
$entropy += -1.0 * $prob * $log_prob;
890
}
891
82
100
116
if (abs($entropy) < 0.0000001) {
892
13
11
$entropy = 0.0;
893
}
894
82
109
$self->{_entropy_cache}->{$sequence} = $entropy;
895
82
132
return $entropy;
896
}
897
898
899
##################################### Probability Calculators ########################################
900
901
sub prior_probability_for_class {
902
4
4
0
5
my $self = shift;
903
4
3
my $class = shift;
904
4
3
my $class_name_in_cache = "prior" . '::' . $class;
905
return $self->{_probability_cache}->{$class_name_in_cache}
906
4
50
16
if exists $self->{_probability_cache}->{$class_name_in_cache};
907
0
0
my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
0
0
908
0
0
my @values = values %{$self->{_samples_class_label_hash}};
0
0
909
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
910
0
0
my @trues = grep {$_ eq $class_name} @values;
0
0
911
0
0
my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
912
0
0
my $this_class_name_in_cache = "prior" . '::' . $class_name;
913
0
0
$self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
914
}
915
0
0
return $self->{_probability_cache}->{$class_name_in_cache};
916
}
917
918
sub calculate_class_priors {
919
1
1
1
7
my $self = shift;
920
1
50
2
return if scalar keys %{$self->{_class_priors_hash}} > 1;
1
4
921
1
2
foreach my $class_name (@{$self->{_class_names}}) {
1
2
922
2
4
my $class_name_in_cache = "prior::$class_name";
923
2
1
my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
2
5
924
2
1
my @all_values = values %{$self->{_samples_class_label_hash}};
2
11
925
2
3
my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
70
53
926
2
5
my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
927
2
3
$self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
928
2
4
my $this_class_name_in_cache = "prior::$class_name";
929
2
6
$self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
930
}
931
1
50
4
if ($self->{_debug1}) {
932
0
0
foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
0
0
933
0
0
print "$class => $self->{_class_priors_hash}->{$class}\n";
934
}
935
}
936
}
937
938
sub calculate_first_order_probabilities {
939
1
1
1
13
print "\nEstimating probabilities...\n";
940
1
2
my $self = shift;
941
1
1
foreach my $feature (@{$self->{_feature_names}}) {
1
3
942
4
13
$self->probability_of_feature_value($feature, undef);
943
4
50
16
if ($self->{_debug2}) {
944
0
0
0
if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
945
0
0
print "\nPresenting probability distribution for a numeric feature:\n";
946
0
0
foreach my $sampling_point (sort {$a <=> $b} keys
0
0
947
0
0
%{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
948
0
0
my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
949
print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
950
0
0
$self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
951
}
952
} else {
953
0
0
print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
954
0
0
my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
0
0
955
0
0
foreach my $value (sort @values_for_feature) {
956
0
0
my $prob = $self->probability_of_feature_value($feature,$value);
957
0
0
print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
958
}
959
}
960
}
961
}
962
}
963
964
sub probability_of_feature_value {
965
190
190
0
133
my $self = shift;
966
190
142
my $feature_name = shift;
967
190
131
my $value = shift;
968
190
50
66
568
$value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
969
190
50
66
373
if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
970
$value = closest_sampling_point($value,
971
0
0
$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
972
}
973
190
112
my $feature_and_value;
974
190
100
228
if (defined($value)) {
975
186
196
$feature_and_value = "$feature_name=$value";
976
}
977
190
50
66
358
if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
978
186
329
return $self->{_probability_cache}->{$feature_and_value};
979
}
980
4
7
my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
981
4
50
10
if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
982
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
983
$self->{_symbolic_to_numeric_cardinality_threshold}) {
984
0
0
0
if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
985
0
0
@valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
0
0
986
0
0
$diffrange = $valuerange[1] - $valuerange[0];
987
0
0
my %seen = ();
988
0
0
0
0
my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
0
0
989
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
990
0
0
my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
0
0
0
0
991
$unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
992
0
0
my $median_diff = $diffs[int(@diffs/2) - 1];
993
0
0
$histogram_delta = $median_diff * 2;
994
0
0
0
if ($histogram_delta < $diffrange / 500.0) {
995
0
0
0
if (defined $self->{_number_of_histogram_bins}) {
996
0
0
$histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
997
} else {
998
0
0
$histogram_delta = $diffrange / 500.0;
999
}
1000
}
1001
0
0
$self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1002
0
0
$num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1003
0
0
$self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1004
0
0
my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
0
0
1005
0..$num_of_histogram_bins-1;
1006
0
0
$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1007
\@sampling_points_for_feature;
1008
}
1009
}
1010
}
1011
4
50
9
if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1012
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1013
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1014
my @sampling_points_for_feature =
1015
0
0
@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
1016
0
0
my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1017
0
0
my @actual_values_for_feature = grep {$_ ne 'NA'}
1018
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1019
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1020
0
0
foreach my $j (0..@actual_values_for_feature-1) {
1021
0
0
0
if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1022
0
0
$counts_at_sampling_points[$i]++
1023
}
1024
}
1025
}
1026
0
0
my $total_counts = 0;
1027
0
0
map {$total_counts += $_} @counts_at_sampling_points;
0
0
1028
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
0
0
1029
0
0
my %bin_prob_hash = ();
1030
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1031
0
0
$bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1032
}
1033
0
0
$self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1034
0
0
my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
0
0
1035
@sampling_points_for_feature;
1036
0
0
foreach my $i (0..@values_for_feature-1) {
1037
0
0
$self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1038
}
1039
0
0
0
0
if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1040
0
0
return $self->{_probability_cache}->{$feature_and_value};
1041
} else {
1042
0
0
return 0;
1043
}
1044
} else {
1045
0
0
my %seen = ();
1046
0
0
0
0
my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1047
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1048
0
0
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
0
0
1049
0
0
my @value_counts = (0) x @values_for_feature;
1050
# foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1051
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
0
0
0
0
1052
0
0
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
0
0
1053
0
0
foreach my $i (0..@values_for_feature-1) {
1054
0
0
foreach my $current_value (@features_and_values) {
1055
0
0
0
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1056
}
1057
}
1058
}
1059
0
0
my $total_counts = 0;
1060
0
0
map {$total_counts += $_} @value_counts;
0
0
1061
0
0
0
die "PFV Something is wrong with your training file. It contains no training samples \
1062
for feature named $feature_name" if $total_counts == 0;
1063
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
0
0
1064
0
0
foreach my $i (0..@values_for_feature-1) {
1065
0
0
$self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1066
}
1067
0
0
0
0
if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1068
0
0
return $self->{_probability_cache}->{$feature_and_value};
1069
} else {
1070
0
0
return 0;
1071
}
1072
}
1073
} else {
1074
# This section is only for purely symbolic features:
1075
4
6
my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
4
26
1076
4
7
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
140
157
1077
4
17
my @value_counts = (0) x @values_for_feature;
1078
# foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1079
4
5
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
548
520
4
50
1080
140
77
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
140
232
1081
140
148
foreach my $i (0..@values_for_feature-1) {
1082
4900
3357
for my $current_value (@features_and_values) {
1083
19600
100
23181
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1084
}
1085
}
1086
}
1087
4
22
foreach my $i (0..@values_for_feature-1) {
1088
$self->{_probability_cache}->{$values_for_feature[$i]} =
1089
140
95
$value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
140
214
1090
}
1091
4
50
33
16
if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1092
0
0
return $self->{_probability_cache}->{$feature_and_value};
1093
} else {
1094
4
20
return 0;
1095
}
1096
}
1097
}
1098
1099
sub probability_of_feature_value_given_class {
1100
380
380
0
266
my $self = shift;
1101
380
261
my $feature_name = shift;
1102
380
266
my $feature_value = shift;
1103
380
225
my $class_name = shift;
1104
380
50
33
1091
$feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1105
380
50
33
678
if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1106
$feature_value = closest_sampling_point($feature_value,
1107
0
0
$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1108
}
1109
380
239
my $feature_value_class;
1110
380
50
417
if (defined($feature_value)) {
1111
380
515
$feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1112
}
1113
380
100
33
759
if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1114
print "\nNext answer returned by cache for feature $feature_name and " .
1115
372
50
443
"value $feature_value given class $class_name\n" if $self->{_debug2};
1116
372
730
return $self->{_probability_cache}->{$feature_value_class};
1117
}
1118
8
14
my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1119
1120
8
50
10
if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1121
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1122
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1123
0
0
$histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1124
0
0
$num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1125
0
0
@valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
0
0
1126
0
0
$diffrange = $valuerange[1] - $valuerange[0];
1127
}
1128
}
1129
8
8
my @samples_for_class = ();
1130
# Accumulate all samples names for the given class:
1131
8
4
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
8
47
1132
280
100
330
if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1133
140
111
push @samples_for_class, $sample_name;
1134
}
1135
}
1136
8
50
18
if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1137
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1138
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1139
my @sampling_points_for_feature =
1140
0
0
@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
1141
0
0
my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1142
0
0
my @actual_feature_values_for_samples_in_class = ();
1143
0
0
foreach my $sample (@samples_for_class) {
1144
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1145
0
0
my $pattern = '(.+)=(.+)';
1146
0
0
$feature_and_value =~ /$pattern/;
1147
0
0
my ($feature, $value) = ($1, $2);
1148
0
0
0
0
if (($feature eq $feature_name) && ($value ne 'NA')) {
1149
0
0
push @actual_feature_values_for_samples_in_class, $value;
1150
}
1151
}
1152
}
1153
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1154
0
0
foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1155
0
0
0
if (abs($sampling_points_for_feature[$i] -
1156
$actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1157
0
0
$counts_at_sampling_points[$i]++;
1158
}
1159
}
1160
}
1161
0
0
my $total_counts = 0;
1162
0
0
map {$total_counts += $_} @counts_at_sampling_points;
0
0
1163
0
0
0
die "PFVC1 Something is wrong with your training file. It contains no training " .
1164
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1165
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
0
0
1166
0
0
my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
0
0
1167
@sampling_points_for_feature;
1168
0
0
foreach my $i (0..@values_for_feature_and_class-1) {
1169
0
0
$self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1170
}
1171
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1172
0
0
return $self->{_probability_cache}->{$feature_value_class};
1173
} else {
1174
0
0
return 0;
1175
}
1176
} else {
1177
# This section is for numeric features that will be treated symbolically
1178
0
0
my %seen = ();
1179
0
0
0
0
my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1180
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1181
0
0
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
0
0
1182
0
0
my @value_counts = (0) x @values_for_feature;
1183
0
0
foreach my $sample (@samples_for_class) {
1184
0
0
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
0
0
1185
0
0
foreach my $i (0..@values_for_feature-1) {
1186
0
0
foreach my $current_value (@features_and_values) {
1187
0
0
0
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1188
}
1189
}
1190
}
1191
0
0
my $total_counts = 0;
1192
0
0
map {$total_counts += $_} @value_counts;
0
0
1193
0
0
0
die "PFVC2 Something is wrong with your training file. It contains no training " .
1194
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1195
# We normalize by total_count because the probabilities are conditioned on a given class
1196
0
0
foreach my $i (0..@values_for_feature-1) {
1197
0
0
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1198
0
0
$self->{_probability_cache}->{$feature_and_value_and_class} =
1199
$value_counts[$i] / (1.0 * $total_counts);
1200
}
1201
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1202
0
0
return $self->{_probability_cache}->{$feature_value_class};
1203
} else {
1204
0
0
return 0;
1205
}
1206
}
1207
} else {
1208
# This section is for purely symbolic features
1209
8
6
my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
8
38
1210
8
8
my %seen = ();
1211
280
100
66
737
@values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1212
8
6
@{$self->{_features_and_values_hash}->{$feature_name}};
8
11
1213
8
8
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
28
37
1214
8
11
my @value_counts = (0) x @values_for_feature;
1215
8
9
foreach my $sample (@samples_for_class) {
1216
140
81
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
140
181
1217
140
139
foreach my $i (0..@values_for_feature-1) {
1218
490
313
foreach my $current_value (@features_and_values) {
1219
1960
100
2377
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1220
}
1221
}
1222
}
1223
8
9
my $total_counts = 0;
1224
8
7
map {$total_counts += $_} @value_counts;
28
23
1225
8
50
29
die "PFVC2 Something is wrong with your training file. It contains no training " .
1226
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1227
# We normalize by total_count because the probabilities are conditioned on a given class
1228
8
11
foreach my $i (0..@values_for_feature-1) {
1229
28
32
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1230
28
61
$self->{_probability_cache}->{$feature_and_value_and_class} =
1231
$value_counts[$i] / (1.0 * $total_counts);
1232
}
1233
8
50
11
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1234
8
33
return $self->{_probability_cache}->{$feature_value_class};
1235
} else {
1236
0
0
return 0;
1237
}
1238
}
1239
}
1240
1241
sub probability_of_feature_less_than_threshold {
1242
0
0
0
0
my $self = shift;
1243
0
0
my $feature_name = shift;
1244
0
0
my $threshold = shift;
1245
0
0
my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1246
return $self->{_probability_cache}->{$feature_threshold_combo}
1247
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1248
0
0
0
my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
0
0
0
0
1249
0
0
0
my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
0
0
1250
0
0
my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1251
0
0
$self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1252
0
0
return $probability;
1253
}
1254
1255
sub probability_of_feature_less_than_threshold_given_class {
1256
0
0
0
0
my $self = shift;
1257
0
0
my $feature_name = shift;
1258
0
0
my $threshold = shift;
1259
0
0
my $class_name = shift;
1260
0
0
my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1261
return $self->{_probability_cache}->{$feature_threshold_class_combo}
1262
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1263
0
0
my @data_samples_for_class = ();
1264
# Accumulate all samples names for the given class:
1265
0
0
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
0
0
1266
push @data_samples_for_class, $sample_name
1267
0
0
0
if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1268
}
1269
0
0
my @actual_feature_values_for_samples_in_class = ();
1270
0
0
foreach my $sample (@data_samples_for_class) {
1271
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1272
0
0
my $pattern = '(.+)=(.+)';
1273
0
0
$feature_and_value =~ /$pattern/;
1274
0
0
my ($feature,$value) = ($1,$2);
1275
0
0
0
0
push @actual_feature_values_for_samples_in_class, $value
1276
if $feature eq $feature_name && $value ne 'NA';
1277
}
1278
}
1279
0
0
0
my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
0
0
1280
# The condition in the assignment that follows was a bug correction in Version 3.20
1281
0
0
0
my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1282
0
0
$self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1283
0
0
return $probability;
1284
}
1285
1286
# This method requires that all truly numeric types only be expressed as '<' or '>'
1287
# constructs in the array of branch features and thresholds
1288
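# A minimal illustration (feature names and values are hypothetical, not taken
# from any shipped dataset): a branch such as
#
#     ['grade=2.3', 'g2<1', 'age>37']
#
# is evaluated as P(grade=2.3) * P(g2<1) * (1 - P(age<37)), after any redundant
# numeric constraints have first been collapsed by
# find_bounded_intervals_for_numeric_features().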
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1289
233
233
0
160
my $self = shift;
1290
233
157
my $arr = shift;
1291
233
253
my @array_of_features_and_values_or_thresholds = @$arr;
1292
233
50
294
return if scalar @array_of_features_and_values_or_thresholds == 0;
1293
233
278
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1294
233
100
463
return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1295
68
48
my $probability = undef;
1296
68
46
my $pattern1 = '(.+)=(.+)';
1297
68
34
my $pattern2 = '(.+)<(.+)';
1298
68
47
my $pattern3 = '(.+)>(.+)';
1299
68
53
my @true_numeric_types = ();
1300
68
59
my @true_numeric_types_feature_names = ();
1301
68
55
my @symbolic_types = ();
1302
68
41
my @symbolic_types_feature_names = ();
1303
68
69
foreach my $item (@array_of_features_and_values_or_thresholds) {
1304
186
50
358
if ($item =~ /$pattern2/) {
50
1305
0
0
push @true_numeric_types, $item;
1306
0
0
my ($feature,$value) = ($1,$2);
1307
0
0
push @true_numeric_types_feature_names, $feature;
1308
} elsif ($item =~ /$pattern3/) {
1309
0
0
push @true_numeric_types, $item;
1310
0
0
my ($feature,$value) = ($1,$2);
1311
0
0
push @true_numeric_types_feature_names, $feature;
1312
} else {
1313
186
128
push @symbolic_types, $item;
1314
186
280
$item =~ /$pattern1/;
1315
186
237
my ($feature,$value) = ($1,$2);
1316
186
193
push @symbolic_types_feature_names, $feature;
1317
}
1318
}
1319
68
66
my %seen1 = ();
1320
68
0
69
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1321
68
40
my %seen2 = ();
1322
68
50
61
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
186
409
1323
68
107
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1324
print_array_with_msg("POS: Answer returned by find_bounded: ",
1325
68
50
81
$bounded_intervals_numeric_types) if $self->{_debug2};
1326
# Calculate the upper and the lower bounds to be used when searching for the best
1327
# threshold for each of the numeric features that are in play at the current node:
1328
68
44
my (%upperbound, %lowerbound);
1329
68
62
foreach my $feature_name (@true_numeric_types_feature_names) {
1330
0
0
$upperbound{$feature_name} = undef;
1331
0
0
$lowerbound{$feature_name} = undef;
1332
}
1333
68
69
foreach my $item (@$bounded_intervals_numeric_types) {
1334
0
0
foreach my $feature_grouping (@$item) {
1335
0
0
0
if ($feature_grouping->[1] eq '>') {
1336
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337
} else {
1338
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1339
}
1340
}
1341
}
1342
68
57
foreach my $feature_name (@true_numeric_types_feature_names) {
1343
0
0
0
0
if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
0
0
0
0
0
0
0
1344
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1345
0
0
return 0;
1346
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1347
0
0
0
if (! $probability) {
1348
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1349
$upperbound{$feature_name}) -
1350
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1351
} else {
1352
$probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1353
$upperbound{$feature_name}) -
1354
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1355
}
1356
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1357
0
0
0
if (! $probability) {
1358
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1359
0
0
$upperbound{$feature_name});
1360
} else {
1361
$probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1362
0
0
$upperbound{$feature_name});
1363
}
1364
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1365
0
0
0
if (! $probability) {
1366
$probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1367
0
0
$lowerbound{$feature_name});
1368
} else {
1369
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1370
0
0
$lowerbound{$feature_name}));
1371
}
1372
} else {
1373
0
0
die("Ill formatted call to 'probability_of_sequence' method");
1374
}
1375
}
1376
68
50
foreach my $feature_and_value (@symbolic_types) {
1377
186
50
508
if ($feature_and_value =~ /$pattern1/) {
1378
186
228
my ($feature,$value) = ($1,$2);
1379
186
100
179
if (! $probability) {
1380
68
113
$probability = $self->probability_of_feature_value($feature, $value);
1381
} else {
1382
118
132
$probability *= $self->probability_of_feature_value($feature, $value);
1383
}
1384
}
1385
}
1386
68
106
$self->{_probability_cache}->{$sequence} = $probability;
1387
68
152
return $probability;
1388
}
1389
1390
## The following method requires that all truly numeric types only be expressed as
1391
## '<' or '>' constructs in the array of branch features and thresholds
1392
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1393
164
164
0
108
my $self = shift;
1394
164
103
my $arr = shift;
1395
164
118
my $class_name = shift;
1396
164
180
my @array_of_features_and_values_or_thresholds = @$arr;
1397
164
50
216
return if scalar @array_of_features_and_values_or_thresholds == 0;
1398
164
212
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1399
164
178
my $sequence_with_class = "$sequence" . "::" . $class_name;
1400
return $self->{_probability_cache}->{$sequence_with_class}
1401
164
100
271
if exists $self->{_probability_cache}->{$sequence_with_class};
1402
144
95
my $probability = undef;
1403
144
118
my $pattern1 = '(.+)=(.+)';
1404
144
85
my $pattern2 = '(.+)<(.+)';
1405
144
85
my $pattern3 = '(.+)>(.+)';
1406
144
132
my @true_numeric_types = ();
1407
144
97
my @true_numeric_types_feature_names = ();
1408
144
86
my @symbolic_types = ();
1409
144
106
my @symbolic_types_feature_names = ();
1410
144
128
foreach my $item (@array_of_features_and_values_or_thresholds) {
1411
380
50
704
if ($item =~ /$pattern2/) {
50
1412
0
0
push @true_numeric_types, $item;
1413
0
0
my ($feature,$value) = ($1,$2);
1414
0
0
push @true_numeric_types_feature_names, $feature;
1415
} elsif ($item =~ /$pattern3/) {
1416
0
0
push @true_numeric_types, $item;
1417
0
0
my ($feature,$value) = ($1,$2);
1418
0
0
push @true_numeric_types_feature_names, $feature;
1419
} else {
1420
380
269
push @symbolic_types, $item;
1421
380
603
$item =~ /$pattern1/;
1422
380
486
my ($feature,$value) = ($1,$2);
1423
380
416
push @symbolic_types_feature_names, $feature;
1424
}
1425
}
1426
144
133
my %seen1 = ();
1427
144
0
125
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1428
144
101
my %seen2 = ();
1429
144
50
120
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
380
841
1430
144
205
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1431
print_array_with_msg("POSC: Answer returned by find_bounded: ",
1432
144
50
186
$bounded_intervals_numeric_types) if $self->{_debug2};
1433
# Calculate the upper and the lower bounds to be used when searching for the best
1434
# threshold for each of the numeric features that are in play at the current node:
1435
144
93
my (%upperbound, %lowerbound);
1436
144
130
foreach my $feature_name (@true_numeric_types_feature_names) {
1437
0
0
$upperbound{$feature_name} = undef;
1438
0
0
$lowerbound{$feature_name} = undef;
1439
}
1440
144
129
foreach my $item (@$bounded_intervals_numeric_types) {
1441
0
0
foreach my $feature_grouping (@$item) {
1442
0
0
0
if ($feature_grouping->[1] eq '>') {
1443
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444
} else {
1445
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1446
}
1447
}
1448
}
1449
144
119
foreach my $feature_name (@true_numeric_types_feature_names) {
1450
0
0
0
0
if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
0
0
0
0
0
0
0
1451
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1452
0
0
return 0;
1453
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1454
0
0
0
if (! $probability) {
1455
1456
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457
$upperbound{$feature_name}, $class_name) -
1458
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1459
0
0
$lowerbound{$feature_name}, $class_name);
1460
} else {
1461
$probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462
$upperbound{$feature_name}, $class_name) -
1463
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1464
0
0
$lowerbound{$feature_name}, $class_name))
1465
}
1466
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1467
0
0
0
if (! $probability) {
1468
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1469
0
0
$upperbound{$feature_name}, $class_name);
1470
} else {
1471
$probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1472
0
0
$upperbound{$feature_name}, $class_name);
1473
}
1474
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1475
0
0
0
if (! $probability) {
1476
$probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1477
0
0
$lowerbound{$feature_name}, $class_name);
1478
} else {
1479
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1480
0
0
$lowerbound{$feature_name}, $class_name));
1481
}
1482
} else {
1483
0
0
die("Ill formatted call to 'probability of sequence given class' method");
1484
}
1485
}
1486
144
122
foreach my $feature_and_value (@symbolic_types) {
1487
380
50
1027
if ($feature_and_value =~ /$pattern1/) {
1488
380
493
my ($feature,$value) = ($1,$2);
1489
380
100
365
if (! $probability) {
1490
144
192
$probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491
} else {
1492
236
272
$probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1493
}
1494
}
1495
}
1496
144
250
$self->{_probability_cache}->{$sequence_with_class} = $probability;
1497
144
337
return $probability;
1498
}
1499
1500
sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1501
244
244
0
153
my $self = shift;
1502
244
161
my $class_name = shift;
1503
244
157
my $arr = shift;
1504
244
248
my @array_of_features_and_values_or_thresholds = @$arr;
1505
244
240
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1506
244
233
my $class_and_sequence = "$class_name" . "::" . $sequence;
1507
return $self->{_probability_cache}->{$class_and_sequence}
1508
244
100
549
if exists $self->{_probability_cache}->{$class_and_sequence};
1509
82
55
my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
82
114
1510
82
54
foreach my $i (0..@{$self->{_class_names}}-1) {
82
100
1511
164
131
my $class_name = $self->{_class_names}->[$i];
1512
164
198
my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1513
\@array_of_features_and_values_or_thresholds, $class_name);
1514
164
100
224
if ($prob < 0.000001) {
1515
13
16
$array_of_class_probabilities[$i] = 0.0;
1516
13
18
next;
1517
}
1518
151
208
my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1519
\@array_of_features_and_values_or_thresholds);
1520
# die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1521
# "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1522
# if ! $prob_of_feature_sequence;
1523
151
186
my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1524
151
50
159
if ($prob_of_feature_sequence) {
1525
151
210
$array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1526
} else {
1527
0
0
$array_of_class_probabilities[$i] = $prior;
1528
}
1529
}
1530
82
56
my $sum_probability;
1531
82
66
map {$sum_probability += $_} @array_of_class_probabilities;
164
159
1532
82
50
92
if ($sum_probability == 0) {
1533
0
0
@array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
0
0
1534
0
0
(0..@{$self->{_class_names}}-1);
0
0
1535
} else {
1536
82
63
@array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
164
200
1537
}
1538
82
64
foreach my $i (0..@{$self->{_class_names}}-1) {
82
143
1539
164
204
my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1540
164
275
$self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1541
}
1542
82
138
return $self->{_probability_cache}->{$class_and_sequence};
1543
}
1544
1545
####################################### Class Based Utilities ##########################################
1546
1547
## Given a list of branch attributes for the numeric features of the form, say,
1548
## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1549
## smallest list that is relevant for the purpose of calculating the probabilities.
1550
## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1551
## time, less than 2, AND, at the same time, less than 3, is the same as the
1552
## probability that the feature is less than 1. Similarly, the probability that `age'
1553
## is greater than 34 and also greater than 37 is the same as `age' being greater
1554
## than 37.
1555
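## A sketch of the return structure (traced from the code below, values
## illustrative): for the input ['g2<1','g2<2','g2<3','age>34','age>36','age>37'],
##
##     my $bounded = $dt->find_bounded_intervals_for_numeric_features(
##                           ['g2<1','g2<2','g2<3','age>34','age>36','age>37']);
##
## would return a reference to roughly [ [['g2','<','1']], [['age','>','37']] ],
## that is, only the tightest '<' constraint and the tightest '>' constraint
## survive for each numeric feature.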
sub find_bounded_intervals_for_numeric_features {
1556
230
230
0
178
my $self = shift;
1557
230
141
my $arr = shift;
1558
230
174
my @arr = @$arr;
1559
230
151
my @features = @{$self->{_feature_names}};
230
295
1560
230
192
my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
0
0
0
0
1561
230
50
316
print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1562
230
155
my @arr3 = ();
1563
230
198
foreach my $feature_name (@features) {
1564
920
624
my @temp = ();
1565
920
633
foreach my $x (@arr1) {
1566
0
0
0
0
push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1567
}
1568
920
50
1201
push @arr3, \@temp if @temp > 0;
1569
}
1570
230
50
276
print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1571
# Sort each list so that '<' entries occur before '>' entries:
1572
230
138
my @arr4;
1573
230
166
foreach my $li (@arr3) {
1574
0
0
my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
0
0
1575
0
0
push @arr4, \@sorted;
1576
}
1577
230
50
281
print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1578
230
168
my @arr5;
1579
230
181
foreach my $li (@arr4) {
1580
0
0
my @temp1 = ();
1581
0
0
my @temp2 = ();
1582
0
0
foreach my $inner (@$li) {
1583
0
0
0
if ($inner->[1] eq '<') {
1584
0
0
push @temp1, $inner;
1585
} else {
1586
0
0
push @temp2, $inner;
1587
}
1588
}
1589
0
0
0
0
if (@temp1 > 0 && @temp2 > 0) {
0
1590
0
0
push @arr5, [\@temp1, \@temp2];
1591
} elsif (@temp1 > 0) {
1592
0
0
push @arr5, [\@temp1];
1593
} else {
1594
0
0
push @arr5, [\@temp2];
1595
}
1596
}
1597
230
50
263
print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1598
230
180
my @arr6 = ();
1599
230
168
foreach my $li (@arr5) {
1600
0
0
my @temp1 = ();
1601
0
0
foreach my $inner (@$li) {
1602
0
0
my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
0
0
1603
0
0
push @temp1, \@sorted;
1604
}
1605
0
0
push @arr6, \@temp1;
1606
}
1607
230
50
256
print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1608
230
182
my @arr9 = ();
1609
230
204
foreach my $li (@arr6) {
1610
0
0
foreach my $alist (@$li) {
1611
0
0
my @newalist = ();
1612
0
0
0
if ($alist->[0][1] eq '<') {
1613
0
0
push @newalist, $alist->[0];
1614
} else {
1615
0
0
push @newalist, $alist->[-1];
1616
}
1617
0
0
0
if ($alist->[0][1] ne $alist->[-1][1]) {
1618
0
0
push @newalist, $alist->[-1];
1619
}
1620
0
0
push @arr9, \@newalist;
1621
}
1622
}
1623
230
50
292
print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1624
230
324
return \@arr9;
1625
1626
}
1627
1628
## This method is used to verify that you used legal feature names in the test
1629
## sample that you want to classify with the decision tree.
1630
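## For example (feature names illustrative): if the tree was trained with the
## features 'g2' and 'age', then
##
##     $dt->check_names_used(['g2=4.2', 'age=55']);     # returns 1
##     $dt->check_names_used(['g3=4.2', 'age=55']);     # returns 0
##
## since 'g3' is not among the feature names seen in the training data.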
sub check_names_used {
1631
0
0
0
0
my $self = shift;
1632
0
0
my $features_and_values_test_data = shift;
1633
0
0
my @features_and_values_test_data = @$features_and_values_test_data;
1634
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
1635
0
0
foreach my $feature_and_value (@features_and_values_test_data) {
1636
0
0
$feature_and_value =~ /$pattern/;
1637
0
0
my ($feature,$value) = ($1,$2);
1638
0
0
0
0
die "Your test data has formatting error" unless defined($feature) && defined($value);
1639
0
0
0
return 0 unless contained_in($feature, @{$self->{_feature_names}});
0
0
1640
}
1641
0
0
return 1;
1642
}
1643
1644
####################################### Data Condition Calculator ######################################
1645
1646
## This method estimates the worst-case fan-out of the decision tree taking into
1647
## account the number of values (and therefore the number of branches emanating from
1648
## a node) for the symbolic features.
1649
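## The estimate itself is simply max_num_values ** num_of_features. For example
## (numbers illustrative), with 4 features and at most 5 distinct values per
## symbolic feature, the worst-case node count reported would be 5 ** 4 = 625.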
sub determine_data_condition {
1650
0
0
0
0
my $self = shift;
1651
0
0
my $num_of_features = scalar @{$self->{_feature_names}};
0
0
1652
0
0
my @values = ();
1653
0
0
my @number_of_values;
1654
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
1655
0
0
push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1656
0
0
0
if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
0
0
1657
0
0
push @number_of_values, scalar @values;
1658
}
1659
0
0
0
return if ! @values;
1660
0
0
print "Number of features: $num_of_features\n";
1661
0
0
my @minmax = minmax(\@number_of_values);
1662
0
0
my $max_num_values = $minmax[1];
1663
0
0
print "Largest number of values for symbolic features is: $max_num_values\n";
1664
0
0
my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1665
0
0
print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1666
"nodes. The exact number of nodes created depends critically on " .
1667
"the entropy_threshold used for node expansion (the default value " .
1668
"for this threshold is 0.01) and on the value set for max_depth_desired " .
1669
"for the depth of the tree.\n";
1670
0
0
0
if ($estimated_number_of_nodes > 10000) {
1671
0
0
print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1672
"large value for entropy_threshold and/or a small value for " .
1673
"for max_depth_desired to reduce the number of nodes created.\n";
1674
0
0
print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1675
0
0
my $answer = <STDIN>;
1676
0
0
$answer =~ s/\r?\n?$//;
1677
0
0
0
while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1678
0
0
print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1679
0
0
$answer = <STDIN>;
1680
0
0
$answer =~ s/\r?\n?$//;
1681
}
1682
0
0
0
die unless $answer =~ /y(es)?/i;
1683
}
1684
}
1685
1686
1687
#################################### Read Training Data From File ######################################
1688
1689
1690
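## A minimal sketch of the CSV layout this method expects (column names and
## values are illustrative, not from any shipped dataset):
##
##     "",grade,g2,age,class
##     1,2.3,4.2,55,benign
##     2,3.1,6.7,61,malignant
##
## The first column holds the unique integer record identifier. With this layout
## you would construct the tree with csv_class_column_index => 4 and
## csv_columns_for_features => [1,2,3], since both options index into the full
## header row (the identifier column being column 0).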
sub get_training_data {
1691
1
1
1
5
my $self = shift;
1692
1
50
11
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1693
1
2
my %class_names = ();
1694
1
2
my %all_record_ids_with_class_labels;
1695
my $firstline;
1696
0
0
my %data_hash;
1697
1
2
$|++;
1698
1
50
24
open FILEIN, $self->{_training_datafile} or die "unable to open $self->{_training_datafile}: $!";
1699
1
2
my $record_index = 0;
1700
1701
1
13
while (<FILEIN>) {
1702
36
50
64
next if /^[ ]*\r?\n?$/;
1703
36
72
$_ =~ s/\r?\n?$//;
1704
36
50
47
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
1705
36
100
42
if ($record_index == 0) {
1706
1
1
$firstline = $record;
1707
1
1
$record_index++;
1708
1
3
next;
1709
}
1710
35
102
my @parts = split /,/, $record;
1711
35
29
my $classname = $parts[$self->{_csv_class_column_index}];
1712
35
22
$class_names{$classname} = 1;
1713
35
29
my $record_label = shift @parts;
1714
35
46
$record_label =~ s/^\s*\"|\"\s*$//g;
1715
35
48
$data_hash{$record_label} = \@parts;
1716
35
31
$all_record_ids_with_class_labels{$record_label} = $classname;
1717
35
50
45
print "." if $record_index % 10000 == 0;
1718
35
74
$record_index++;
1719
}
1720
1
6
close FILEIN;
1721
1
2
$|--;
1722
1
2
$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1723
1
50
3
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1724
1
6
my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1725
1
2
my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1726
1
2
my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
2
6
1727
1
2
my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
4
5
1
2
1728
1
6
my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
35
59
1729
1
7
my @sample_names = map {"sample_$_"} keys %data_hash;
35
30
1730
1
50
6
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
35
21
35
26
140
99
140
97
140
331
35
32
1731
1
50
5
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
4
3
4
3
4
16
140
78
140
199
140
111
1
2
1732
1
2
my %numeric_features_valuerange_hash = ();
1733
1
1
my %feature_values_how_many_uniques_hash = ();
1734
1
1
my %features_and_unique_values_hash = ();
1735
1
3
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1736
1
3
foreach my $feature (keys %features_and_values_hash) {
1737
4
4
my %seen1 = ();
1738
140
100
66
369
my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1739
4
3
@{$features_and_values_hash{$feature}};
4
6
1740
4
5
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1741
4
5
my $not_all_values_float = 0;
1742
4
50
5
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
14
92
1743
4
50
9
if ($not_all_values_float == 0) {
1744
0
0
my @minmaxvalues = minmax(\@unique_values_for_feature);
1745
0
0
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1746
}
1747
4
8
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1748
}
1749
1
50
4
if ($self->{_debug1}) {
1750
0
0
print "\nAll class names: @all_class_names\n";
1751
0
0
print "\nEach sample data record:\n";
1752
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
0
0
1753
0
0
print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
0
0
1754
}
1755
0
0
print "\nclass label for each data sample:\n";
1756
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
0
0
1757
0
0
print "$sample => $class_for_sample_hash{$sample}\n";
1758
}
1759
0
0
print "\nFeatures used: @feature_names\n\n";
1760
0
0
print "\nfeatures and the values taken by them:\n";
1761
0
0
foreach my $feature (sort keys %features_and_values_hash) {
1762
0
0
print "$feature => @{$features_and_values_hash{$feature}}\n";
0
0
1763
}
1764
0
0
print "\nnumeric features and their ranges:\n";
1765
0
0
foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1766
0
0
print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
0
0
1767
}
1768
0
0
print "\nnumber of unique values in each feature:\n";
1769
0
0
foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1770
0
0
print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1771
}
1772
}
1773
1
2
$self->{_class_names} = \@all_class_names;
1774
1
5
$self->{_feature_names} = \@feature_names;
1775
1
2
$self->{_samples_class_label_hash} = \%class_for_sample_hash;
1776
1
2
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
1777
1
2
$self->{_features_and_values_hash} = \%features_and_values_hash;
1778
1
2
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1779
1
1
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1780
1
18
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1781
}
1782
1783
sub show_training_data {
1784
0
0
1
0
my $self = shift;
1785
0
0
my @class_names = @{$self->{_class_names}};
0
0
1786
0
0
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
0
0
1787
0
0
my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
0
0
1788
0
0
my %training_data_hash = %{$self->{_training_data_hash}};
0
0
1789
0
0
print "\n\nClass Names: @class_names\n";
1790
0
0
print "\n\nFeatures and Their Values:\n\n";
1791
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
1792
0
0
print "$k ---> @{$features_and_values_hash{$k}}\n";
0
0
1793
}
1794
0
0
print "\n\nSamples vs. Class Labels:\n\n";
1795
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
0
0
1796
0
0
print "$kee => $samples_class_label_hash{$kee}\n";
1797
}
1798
0
0
print "\n\nTraining Samples:\n\n";
1799
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
0
0
1800
keys %training_data_hash) {
1801
0
0
print "$kee => @{$training_data_hash{$kee}}\n";
0
0
1802
}
1803
}
1804
1805
sub get_class_names {
1806
0
0
0
0
my $self = shift;
1807
0
0
return @{$self->{_class_names}}
0
0
1808
}
1809
1810
########################################## Utility Routines ############################################
1811
1812
sub closest_sampling_point {
1813
0
0
0
0
my $value = shift;
1814
0
0
my $arr_ref = shift;
1815
0
0
my @arr = @{$arr_ref};
0
0
1816
0
0
my @compare = map {abs($_ - $value)} @arr;
0
0
1817
0
0
my ($minval,$index) = minimum(\@compare);
1818
0
0
return $arr[$index];
1819
}
1820
1821
## returns the array index that contains a specified STRING value: (meant only for array of strings)
1822
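## For example, get_index_at_value('b', ['a', 'b', 'c']) returns 1; if the value
## is not present, the loop falls through and no meaningful index is returned.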
sub get_index_at_value {
1823
0
0
0
0
my $value = shift;
1824
0
0
my @array = @{shift @_};
0
0
1825
0
0
foreach my $i (0..@array-1) {
1826
0
0
0
return $i if $value eq $array[$i];
1827
}
1828
}
1829
1830
## When the training data is read from a CSV file, we assume that the first column
1831
## of each data record contains a unique integer identifier for the record in that
1832
## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1833
## followed by the identifying integers. The purpose of this function is to return
1834
## the identifying integer associated with a data record.
1835
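## For example, sample_index("sample_23") returns 23, which is what makes numeric
## sorts over the sample keys possible elsewhere in this module:
##
##     my @sorted = sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}};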
sub sample_index {
1836
1096
1096
0
678
my $arg = shift;
1837
1096
1079
$arg =~ /_(.+)$/;
1838
1096
1212
return $1;
1839
}
1840
1841
# Returns the minimum value and its positional index in an array
1842
sub minimum {
1843
0
0
0
0
my $arr = shift;
1844
0
0
my $min;
1845
my $index;
1846
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1847
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1848
0
0
$index = $i;
1849
0
0
$min = $arr->[$i];
1850
}
1851
}
1852
0
0
return ($min, $index);
1853
}
1854
1855
# Returns an array of two values, the min and the max, of an array of floats
1856
sub minmax {
1857
0
0
0
0
my $arr = shift;
1858
0
0
my ($min, $max);
1859
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1860
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1861
0
0
$min = $arr->[$i];
1862
}
1863
0
0
0
0
if ( (!defined $max) || ($arr->[$i] > $max) ) {
1864
0
0
$max = $arr->[$i];
1865
}
1866
}
1867
0
0
return ($min, $max);
1868
}
1869
1870
# checks whether an element is in an array:
1871
sub contained_in {
1872
169
169
0
113
my $ele = shift;
1873
169
162
my @array = @_;
1874
169
95
my $count = 0;
1875
169
100
129
map {$count++ if $ele eq $_} @array;
376
598
1876
169
275
return $count;
1877
}
1878
1879
# Meant only for an array of strings (no nesting):
1880
sub deep_copy_array {
1881
136
136
0
96
my $ref_in = shift;
1882
136
80
my $ref_out;
1883
136
100
206
return [] if scalar @$ref_in == 0;
1884
121
83
foreach my $i (0..@{$ref_in}-1) {
121
158
1885
239
251
$ref_out->[$i] = $ref_in->[$i];
1886
}
1887
121
171
return $ref_out;
1888
}
1889
1890
sub check_for_illegal_params2 {
1891
1
1
0
2
my @params = @_;
1892
1
3
my @legal_params = qw / training_datafile
1893
entropy_threshold
1894
max_depth_desired
1895
csv_class_column_index
1896
csv_columns_for_features
1897
symbolic_to_numeric_cardinality_threshold
1898
number_of_histogram_bins
1899
csv_cleanup_needed
1900
debug1
1901
debug2
1902
debug3
1903
/;
1904
1
1
my $found_match_flag;
1905
1
3
foreach my $param (@params) {
1906
3
2
foreach my $legal (@legal_params) {
1907
10
8
$found_match_flag = 0;
1908
10
100
15
if ($param eq $legal) {
1909
3
2
$found_match_flag = 1;
1910
3
3
last;
1911
}
1912
}
1913
3
50
5
last if $found_match_flag == 0;
1914
}
1915
1
4
return $found_match_flag;
1916
}
1917
1918
sub print_array_with_msg {
1919
0
0
0
0
my $message = shift;
1920
0
0
my $arr = shift;
1921
0
0
print "\n$message: ";
1922
0
0
print_nested_array( $arr );
1923
}
1924
1925
sub print_nested_array {
1926
0
0
0
0
my $arr = shift;
1927
0
0
my @arr = @$arr;
1928
0
0
print "[";
1929
0
0
foreach my $item (@arr) {
1930
0
0
0
if (ref $item) {
1931
0
0
print_nested_array($item);
1932
} else {
1933
0
0
print "$item";
1934
}
1935
}
1936
0
0
print "]";
1937
}
1938
1939
sub cleanup_csv {
1940
0
0
0
0
my $line = shift;
1941
0
0
$line =~ tr/\/:?()[]{}'/ /;
1942
0
0
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1943
0
0
for (@double_quoted) {
1944
0
0
my $item = $_;
1945
0
0
$item = substr($item, 1, -1);
1946
0
0
$item =~ s/^\s+|,|\s+$//g;
1947
0
0
$item = join '_', split /\s+/, $item;
1948
0
0
substr($line, index($line, $_), length($_)) = $item;
1949
}
1950
0
0
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
1951
0
0
for (@white_spaced) {
1952
0
0
my $item = $_;
1953
0
0
$item =~ s/\s+/_/g;
1954
0
0
$item =~ s/^\s*_|_\s*$//g;
1955
0
0
substr($line, index($line, $_), length($_)) = $item;
1956
}
1957
0
0
$line =~ s/,\s*(?=,|$)/,NA/g;
1958
0
0
return $line;
1959
}
1960
1961
######################################### Class EvalTrainingData ########################################
1962
1963
## This subclass of the DecisionTree class is used to evaluate the quality of your
1964
## training data by running a 10-fold cross-validation test on it. This test divides
1965
## all of the training data into ten parts, with nine parts used for training a
1966
## decision tree and one part used for testing its ability to classify correctly.
1967
## This selection of nine parts for training and one part for testing is carried out
1968
## in all of the ten different possible ways. This testing functionality can also
1969
## be used to find the best values to use for the constructor parameters
1970
## entropy_threshold, max_depth_desired, and
1971
## symbolic_to_numeric_cardinality_threshold.
1972
1973
## Only the CSV training files can be evaluated in this manner (because only CSV
1974
## training files are allowed to have numeric features --- which is the more interesting
1975
## case for evaluation analytics).
1976
1977
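## A minimal usage sketch (file name and column choices are hypothetical):
##
##     my $eval_data = EvalTrainingData->new( training_datafile => 'training.csv',
##                                            csv_class_column_index => 1,
##                                            csv_columns_for_features => [2,3],
##                                            entropy_threshold => 0.01,
##                                            max_depth_desired => 3,
##                                            symbolic_to_numeric_cardinality_threshold => 10,
##                                          );
##     $eval_data->get_training_data();
##     $eval_data->evaluate_training_data();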
package EvalTrainingData;
1978
1979
@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1980
1981
sub new {
1982
0
0
0
my $class = shift;
1983
0
0
my $instance = Algorithm::DecisionTree->new(@_);
1984
0
0
bless $instance, $class;
1985
}
1986
1987
sub evaluate_training_data {
1988
0
0
0
my $self = shift;
1989
0
0
my $evaldebug = 0;
1990
die "The data evaluation function in the module can only be used when your " .
1991
0
0
0
"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1992
0
0
print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1993
"class-discriminatory power:\n";
1994
0
0
my %all_training_data = %{$self->{_training_data_hash}};
0
0
1995
0
0
my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
1996
Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
1997
0
0
my $fold_size = int(0.1 * (scalar keys %all_training_data));
1998
0
0
print "fold size: $fold_size\n";
1999
0
0
my %confusion_matrix = ();
2000
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2001
0
0
foreach my $inner_class_name (@{$self->{_class_names}}) {
0
0
2002
0
0
$confusion_matrix{$class_name}->{$inner_class_name} = 0;
2003
}
2004
}
2005
0
0
foreach my $fold_index (0..9) {
2006
0
0
print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2007
0
0
my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2008
0
0
my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2009
@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2010
0
0
my %testing_data = ();
2011
0
0
foreach my $x (@testing_samples) {
2012
0
0
$testing_data{$x} = $all_training_data{$x};
2013
}
2014
0
0
my %training_data = ();
2015
0
0
foreach my $x (@training_samples) {
2016
0
0
$training_data{$x} = $all_training_data{$x};
2017
}
2018
0
0
my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2019
0
0
$trainingDT->{_training_data_hash} = \%training_data;
2020
0
0
$trainingDT->{_class_names} = $self->{_class_names};
2021
0
0
$trainingDT->{_feature_names} = $self->{_feature_names};
2022
0
0
$trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2023
0
0
$trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2024
$trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2025
0
0
$self->{_symbolic_to_numeric_cardinality_threshold};
2026
0
0
foreach my $sample_name (@training_samples) {
2027
$trainingDT->{_samples_class_label_hash}->{$sample_name} =
2028
0
0
$self->{_samples_class_label_hash}->{$sample_name};
2029
}
2030
0
0
foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
0
0
2031
0
0
$trainingDT->{_features_and_values_hash}->{$feature} = ();
2032
}
2033
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
2034
0
0
foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
2035
Algorithm::DecisionTree::sample_index($b)}
2036
0
0
keys %{$trainingDT->{_training_data_hash}}) {
2037
0
0
foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
0
0
2038
0
0
$feature_and_value =~ /$pattern/;
2039
0
0
my ($feature,$value) = ($1,$2);
2040
0
0
0
push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
0
0
2041
}
2042
}
2043
0
0
foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2044
0
0
my %seen = ();
2045
0
0
0
0
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2046
0
0
@{$trainingDT->{_features_and_values_hash}->{$feature}};
0
0
2047
0
0
0
if (Algorithm::DecisionTree::contained_in($feature,
2048
0
0
keys %{$self->{_numeric_features_valuerange_hash}})) {
2049
0
0
@unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
0
0
2050
} else {
2051
0
0
@unique_values_for_feature = sort @unique_values_for_feature;
2052
}
2053
0
0
$trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2054
}
2055
0
0
foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
0
0
2056
my @minmaxvalues = Algorithm::DecisionTree::minmax(
2057
0
0
\@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
0
0
2058
0
0
$trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2059
}
2060
0
0
0
if ($evaldebug) {
2061
0
0
print "\n\nprinting samples in the testing set: @testing_samples\n";
2062
0
0
print "\n\nPrinting features and their values in the training set:\n";
2063
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2064
0
0
print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
0
0
2065
}
2066
0
0
print "\n\nPrinting unique values for features:\n";
2067
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
0
0
2068
0
0
print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
0
0
2069
}
2070
0
0
print "\n\nPrinting unique value ranges for features:\n";
2071
0
0
foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
0
0
2072
0
0
print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
0
0
2073
}
2074
}
2075
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
2076
$trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2077
0
0
scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
0
0
2078
}
2079
0
0
0
$trainingDT->{_debug2} = 1 if $evaldebug;
2080
0
0
$trainingDT->calculate_first_order_probabilities();
2081
0
0
$trainingDT->calculate_class_priors();
2082
0
0
my $root_node = $trainingDT->construct_decision_tree_classifier();
2083
0
0
0
$root_node->display_decision_tree(" ") if $evaldebug;
2084
0
0
foreach my $test_sample_name (@testing_samples) {
2085
0
0
my @test_sample_data = @{$all_training_data{$test_sample_name}};
0
0
2086
0
0
0
print "original data in test sample: @test_sample_data\n" if $evaldebug;
2087
0
0
0
0
@test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
0
0
2088
0
0
0
print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2089
0
0
my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
0
0
2090
0
0
my @solution_path = @{$classification{'solution_path'}};
0
0
2091
0
0
delete $classification{'solution_path'};
2092
0
0
my @which_classes = keys %classification;
2093
0
0
@which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
0
0
2094
0
0
my $most_likely_class_label = $which_classes[0];
2095
0
0
0
if ($evaldebug) {
2096
0
0
print "\nClassification:\n\n";
2097
0
0
print " class probability\n";
2098
0
0
print " ---------- -----------\n";
2099
0
0
foreach my $which_class (@which_classes) {
2100
0
0
my $classstring = sprintf("%-30s", $which_class);
2101
0
0
my $valuestring = sprintf("%-30s", $classification{$which_class});
2102
0
0
print " $classstring $valuestring\n";
2103
}
2104
0
0
print "\nSolution path in the decision tree: @solution_path\n";
2105
0
0
print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2106
}
2107
0
0
my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2108
0
0
0
print "$test_sample_name: true_class: $true_class_label_for_sample " .
2109
"estimated_class: $most_likely_class_label\n" if $evaldebug;
2110
0
0
$confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2111
}
2112
}
2113
0
0
print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2114
0
0
my $matrix_header = " " x 30;
2115
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2116
0
0
$matrix_header .= sprintf("%-30s", $class_name);
2117
}
2118
0
0
print "\n" . $matrix_header . "\n\n";
2119
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2120
0
0
my $row_display = sprintf("%-30s", $row_class_name);
2121
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2122
0
0
$row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2123
}
2124
0
0
print "$row_display\n\n";
2125
}
2126
0
0
print "\n\n";
2127
0
0
my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2128
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2129
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2130
0
0
0
if ($row_class_name eq $col_class_name) {
2131
0
0
$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2132
} else {
2133
0
0
$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2134
}
2135
}
2136
}
2137
0
0
my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2138
0
0
print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2139
0
0
0
0
if ($data_quality_index <= 80) {
0
0
0
0
0
2140
0
0
print "\nYour training data does not possess much class discriminatory " .
2141
"information. It could be that the classes are inherently not well " .
2142
"separable or that your constructor parameter choices are not appropriate.\n";
2143
} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2144
0
0
print "\nYour training data possesses some class discriminatory information " .
2145
"but it may not be sufficient for real-world applications. You might " .
2146
"try tweaking the constructor parameters to see if that improves the " .
2147
"class discriminations.\n";
2148
} elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2149
0
0
print "\nYour training data appears to possess good class discriminatory " .
2150
"information. Whether or not it is acceptable would depend on your " .
2151
"application.\n";
2152
} elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2153
0
0
print "\nYour training data is of excellent quality.\n";
2154
} else {
2155
0
0
print "\nYour training data is perfect.\n";
2156
}
2157
2158
}
2159
2160
2161
############################################# Class DTNode #############################################
2162
2163
# The nodes of the decision tree are instances of this class:
2164
2165
package DTNode;
2166
2167
1
1
16
use strict;
1
2
1
25
2168
1
1
4
use Carp;
1
1
1
847
2169
2170
# $feature is the feature test at the current node. $branch_features_and_values is
2171
# an anonymous array holding the feature names and corresponding values on the path
2172
# from the root to the current node:
2173
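# A hypothetical root-node construction (argument values illustrative):
#
#     my $root = DTNode->new(undef, $entropy_at_root, \@class_priors, [], $dt, 'root');
#
# Passing 'root' as the last argument resets $dt->{nodes_created} so that the
# serial numbers handed out by get_next_serial_num() start at 0 for the root.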
sub new {
2174
26
26
26
my ($class, $feature, $entropy, $class_probabilities,
2175
$branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2176
26
100
46
$root_or_not = '' if !defined $root_or_not;
2177
26
100
39
if ($root_or_not eq 'root') {
2178
1
2
$dt->{nodes_created} = -1;
2179
1
4
$dt->{class_names} = undef;
2180
}
2181
26
78
my $self = {
2182
_dt => $dt,
2183
_feature => $feature,
2184
_node_creation_entropy => $entropy,
2185
_class_probabilities => $class_probabilities,
2186
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2187
_linked_to => [],
2188
};
2189
26
29
bless $self, $class;
2190
26
31
$self->{_serial_number} = $self->get_next_serial_num();
2191
26
29
return $self;
2192
}
2193
2194
sub how_many_nodes {
2195
0
0
0
my $self = shift;
2196
0
0
return $self->{_dt}->{nodes_created} + 1;
2197
}
2198
2199
sub set_class_names {
2200
1
1
1
my $self = shift;
2201
1
1
my $class_names_list = shift;
2202
1
2
$self->{_dt}->{class_names} = $class_names_list;
2203
}
2204
2205
sub get_class_names {
2206
0
0
0
my $self = shift;
2207
0
0
return $self->{_dt}->{class_names};
2208
}
2209
2210
sub get_next_serial_num {
2211
26
26
18
my $self = shift;
2212
26
31
$self->{_dt}->{nodes_created} += 1;
2213
26
35
return $self->{_dt}->{nodes_created};
2214
}
2215
2216
sub get_serial_num {
2217
26
26
18
my $self = shift;
2218
26
24
$self->{_serial_number};
2219
}
2220
2221
# this returns the feature test at the current node
2222
sub get_feature {
2223
0
0
0
my $self = shift;
2224
0
0
return $self->{ _feature };
2225
}
2226
2227
sub set_feature {
2228
18
18
16
my $self = shift;
2229
18
16
my $feature = shift;
2230
18
20
$self->{_feature} = $feature;
2231
}
2232
2233
sub get_node_entropy {
2234
26
26
21
my $self = shift;
2235
26
26
return $self->{_node_creation_entropy};
2236
}
2237
2238
sub get_class_probabilities {
2239
0
0
0
my $self = shift;
2240
0
0
return $self->{ _class_probabilities};
2241
}
2242
2243
sub get_branch_features_and_values_or_thresholds {
2244
26
26
19
my $self = shift;
2245
26
34
return $self->{_branch_features_and_values_or_thresholds};
2246
}
2247
2248
sub add_to_branch_features_and_values {
2249
0
0
0
my $self = shift;
2250
0
0
my $feature_and_value = shift;
2251
0
0
push @{$self->{ _branch_features_and_values }}, $feature_and_value;
0
0
2252
}
2253
2254
sub get_children {
2255
0
0
0
my $self = shift;
2256
0
0
return $self->{_linked_to};
2257
}
2258
2259
sub add_child_link {
2260
25
25
22
my ($self, $new_node, ) = @_;
2261
25
16
push @{$self->{_linked_to}}, $new_node;
25
36
2262
}
2263
2264
sub delete_all_links {
2265
0
0
0
my $self = shift;
2266
0
0
$self->{_linked_to} = undef;
2267
}
2268
2269
sub display_node {
2270
0
0
0
my $self = shift;
2271
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2272
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2273
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2274
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2275
0
0
my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2276
0
0
my $serial_num = $self->get_serial_num();
2277
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2278
0
0
print "\n\nNODE $serial_num" .
2279
":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2280
"\n Class probabilities at current node: @class_probabilities_for_display" .
2281
"\n Entropy at current node: $print_node_creation_entropy_at_node" .
2282
"\n Best feature test at current node: $feature_at_node\n\n";
2283
}
2284
2285
sub display_decision_tree {
2286
0
0
0
my $self = shift;
2287
0
0
my $offset = shift;
2288
0
0
my $serial_num = $self->get_serial_num();
2289
0
0
0
if (@{$self->get_children()} > 0) {
0
0
2290
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2291
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2292
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2293
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2294
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2295
0
0
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2296
0
0
my @class_names = @{$self->get_class_names()};
0
0
2297
my @print_class_probabilities_with_class =
2298
0
0
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
0
0
2299
0
0
print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2300
0
0
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2301
0
0
print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2302
"$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2303
0
0
$offset .= " ";
2304
0
0
foreach my $child (@{$self->get_children()}) {
0
0
2305
0
0
$child->display_decision_tree($offset);
2306
}
2307
} else {
2308
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2309
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2310
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2311
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2312
0
0
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2313
0
0
my @class_names = @{$self->get_class_names()};
0
0
2314
my @print_class_probabilities_with_class =
2315
0
0
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
0
0
2316
0
0
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2317
0
0
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2318
0
0
print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2319
"Class Probs: @print_class_probabilities_with_class\n\n";
2320
}
2321
}
2322
2323
2324
############################## Generate Your Own Numeric Training Data #################################
2325
############################# Class TrainingDataGeneratorNumeric ################################
2326
2327
## See the script generate_training_data_numeric.pl in the examples
2328
## directory on how to use this class for generating your own numeric training and
2329
## test data. The training and test data are generated in accordance with the
2330
## specifications you place in the parameter file that is supplied as an argument to
2331
## the constructor of this class.
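## A minimal usage sketch (the file names below are placeholders, and the calls
## are assumed to be made from a script that has already done
## `use Algorithm::DecisionTree;'):
##
##     my $generator = TrainingDataGeneratorNumeric->new(
##                         output_training_csv_file       => 'training.csv',
##                         output_test_csv_file           => 'test.csv',
##                         parameter_file                 => 'param_numeric.txt',
##                         number_of_samples_for_training => 3000,
##                         number_of_samples_for_testing  => 1000,
##                     );
##     $generator->read_parameter_file_numeric();
##     $generator->gen_numeric_training_and_test_data_and_write_to_csv();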
2332
2333
package TrainingDataGeneratorNumeric;
2334
2335
1
1
5
use strict;
1
1
1
16
2336
1
1
3
use Carp;
1
1
1
943
2337
2338
sub new {
2339
0
0
0
my ($class, %args) = @_;
2340
0
0
my @params = keys %args;
2341
0
0
0
croak "\nYou have used a wrong name for a keyword argument " .
2342
"--- perhaps a misspelling\n"
2343
if check_for_illegal_params3(@params) == 0;
2344
bless {
2345
_output_training_csv_file => $args{'output_training_csv_file'}
2346
|| croak("name for output_training_csv_file required"),
2347
_output_test_csv_file => $args{'output_test_csv_file'}
2348
|| croak("name for output_test_csv_file required"),
2349
_parameter_file => $args{'parameter_file'}
2350
|| croak("parameter_file required"),
2351
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2352
|| croak("number_of_samples_for_training"),
2353
_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2354
|| croak("number_of_samples_for_testing"),
2355
0
0
0
_debug => $args{debug} || 0,
0
0
0
0
0
2356
_class_names => [],
2357
_class_names_and_priors => {},
2358
_features_with_value_range => {},
2359
_features_ordered => [],
2360
_classes_and_their_param_values => {},
2361
}, $class;
2362
}
2363
2364
sub check_for_illegal_params3 {
2365
0
0
0
my @params = @_;
2366
0
0
my @legal_params = qw / output_training_csv_file
2367
output_test_csv_file
2368
parameter_file
2369
number_of_samples_for_training
2370
number_of_samples_for_testing
2371
debug
2372
/;
2373
0
0
my $found_match_flag;
2374
0
0
foreach my $param (@params) {
2375
0
0
foreach my $legal (@legal_params) {
2376
0
0
$found_match_flag = 0;
2377
0
0
0
if ($param eq $legal) {
2378
0
0
$found_match_flag = 1;
2379
0
0
last;
2380
}
2381
}
2382
0
0
0
last if $found_match_flag == 0;
2383
}
2384
0
0
return $found_match_flag;
2385
}
2386
2387
## The training data generated by an instance of the class
2388
## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file that
2389
## you supply to the class constructor through a constructor variable called
2390
## `parameter_file'. This method is for parsing the parameter file in order to
2391
## determine the names to be used for the different data classes, their
2392
## means, and their variances.
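## Judging from the regular expressions used below, a parameter file is expected
## to follow roughly this layout (the class names, feature names, and numbers
## here are purely illustrative; the parameter files shipped in the examples
## directory are the authoritative reference):
##
##     class names: recession goodtimes
##     class priors: 0.4 0.6
##     feature name: gdp
##     value range: 0 100
##     feature name: return_on_invest
##     value range: 0 100
##     params for class: recession
##     mean: 50 30
##     covariance:
##         1 0
##         0 1
##     params for class: goodtimes
##     mean: 60 40
##     covariance:
##         1 0
##         0 1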
2393
sub read_parameter_file_numeric {
2394
0
0
0
my $self = shift;
2395
0
0
my @class_names = ();
2396
0
0
my %class_names_and_priors = ();
2397
0
0
my %features_with_value_range = ();
2398
0
0
my %classes_and_their_param_values = ();
2399
# my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2400
0
0
0
open FILE, $self->{_parameter_file} or die "unable to open parameter file: $!";
2401
0
0
my @params = <FILE>;
2402
0
0
my $params = join "", @params;
2403
0
0
my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2404
0
0
$params =~ /$regex/si;
2405
0
0
my ($class_names, $class_priors) = ($1, $2);
2406
0
0
@class_names = split ' ', $class_names;
2407
0
0
my @class_priors = split ' ', $class_priors;
2408
0
0
foreach my $i (0..@class_names-1) {
2409
0
0
$class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2410
}
2411
0
0
0
if ($self->{_debug}) {
2412
0
0
foreach my $cname (keys %class_names_and_priors) {
2413
0
0
print "$cname => $class_names_and_priors{$cname}\n";
2414
}
2415
}
2416
0
0
$regex = 'feature name: \w*.*?value range: [\d\. -]+';
2417
0
0
my @features = $params =~ /$regex/gsi;
2418
0
0
my @features_ordered;
2419
0
0
$regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2420
0
0
foreach my $feature (@features) {
2421
0
0
$feature =~ /$regex/i;
2422
0
0
my $feature_name = $1;
2423
0
0
push @features_ordered, $feature_name;
2424
0
0
my @value_range = split ' ', $2;
2425
0
0
$features_with_value_range{$feature_name} = \@value_range;
2426
}
2427
0
0
0
if ($self->{_debug}) {
2428
0
0
foreach my $fname (keys %features_with_value_range) {
2429
0
0
print "$fname => @{$features_with_value_range{$fname}}\n";
0
0
2430
}
2431
}
2432
0
0
foreach my $i (0..@class_names-1) {
2433
0
0
$classes_and_their_param_values{$class_names[$i]} = {};
2434
}
2435
0
0
$regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2436
0
0
my @class_params = $params =~ /$regex/gsi;
2437
0
0
$regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2438
0
0
foreach my $class_param (@class_params) {
2439
0
0
$class_param =~ /$regex/gsi;
2440
0
0
my $class_name = $1;
2441
0
0
my @class_mean = split ' ', $2;
2442
0
0
$classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2443
0
0
my $class_param_string = $3;
2444
0
0
my @covar_rows = split '\n', $class_param_string;
2445
0
0
my @covar_matrix;
2446
0
0
foreach my $row (@covar_rows) {
2447
0
0
my @row = split ' ', $row;
2448
0
0
push @covar_matrix, \@row;
2449
}
2450
0
0
$classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2451
}
2452
0
0
0
if ($self->{_debug}) {
2453
0
0
print "\nThe class parameters are:\n\n";
2454
0
0
foreach my $cname (keys %classes_and_their_param_values) {
2455
0
0
print "\nFor class name $cname:\n";
2456
0
0
my %params_hash = %{$classes_and_their_param_values{$cname}};
0
0
2457
0
0
foreach my $x (keys %params_hash) {
2458
0
0
0
if ($x eq 'mean') {
2459
0
0
print " $x => @{$params_hash{$x}}\n";
0
0
2460
} else {
2461
0
0
0
if ($x eq 'covariance') {
2462
0
0
print " The covariance matrix:\n";
2463
0
0
my @matrix = @{$params_hash{'covariance'}};
0
0
2464
0
0
foreach my $row (@matrix) {
2465
0
0
print " @$row\n";
2466
}
2467
}
2468
}
2469
}
2470
}
2471
}
2472
0
0
$self->{_class_names} = \@class_names;
2473
0
0
$self->{_class_names_and_priors} = \%class_names_and_priors;
2474
0
0
$self->{_features_with_value_range} = \%features_with_value_range;
2475
0
0
$self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2476
0
0
$self->{_features_ordered} = \@features_ordered;
2477
}
2478
2479
## After the parameter file is parsed by the previous method, this method calls on
2480
## Math::Random::random_multivariate_normal() to generate the training and test data
2481
## samples. Your training and test data can be of any number of dimensions, can
2482
## have any mean, and any covariance. The training and test data must obviously be
2483
## drawn from the same distribution.
2484
sub gen_numeric_training_and_test_data_and_write_to_csv {
2485
1
1
584
use Math::Random;
1
4461
1
767
2486
0
0
0
my $self = shift;
2487
0
0
my %training_samples_for_class;
2488
my %test_samples_for_class;
2489
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2490
0
0
$training_samples_for_class{$class_name} = [];
2491
0
0
$test_samples_for_class{$class_name} = [];
2492
}
2493
0
0
foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
0
0
2494
0
0
my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
0
0
2495
0
0
my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
0
0
2496
my @new_training_data = Math::Random::random_multivariate_normal(
2497
0
0
$self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2498
@mean, @covariance );
2499
my @new_test_data = Math::Random::random_multivariate_normal(
2500
0
0
$self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2501
@mean, @covariance );
2502
0
0
0
if ($self->{_debug}) {
2503
0
0
print "training data for class $class_name:\n";
2504
0
0
foreach my $x (@new_training_data) {print "@$x\n";}
0
0
2505
0
0
print "\n\ntest data for class $class_name:\n";
2506
0
0
foreach my $x (@new_test_data) {print "@$x\n";}
0
0
2507
}
2508
0
0
$training_samples_for_class{$class_name} = \@new_training_data;
2509
0
0
$test_samples_for_class{$class_name} = \@new_test_data;
2510
}
2511
0
0
my @training_data_records = ();
2512
0
0
my @test_data_records = ();
2513
0
0
foreach my $class_name (keys %training_samples_for_class) {
2514
my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2515
0
0
$self->{_class_names_and_priors}->{$class_name};
2516
my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2517
0
0
$self->{_class_names_and_priors}->{$class_name};
2518
0
0
foreach my $sample_index (0..$num_of_samples_for_training-1) {
2519
0
0
my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
0
0
2520
0
0
@training_vector = map {sprintf("%.3f", $_)} @training_vector;
0
0
2521
0
0
my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2522
0
0
push @training_data_records, $training_data_record;
2523
}
2524
0
0
foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2525
0
0
my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
0
0
2526
0
0
@test_vector = map {sprintf("%.3f", $_)} @test_vector;
0
0
2527
0
0
my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2528
0
0
push @test_data_records, $test_data_record;
2529
}
2530
}
2531
0
0
fisher_yates_shuffle(\@training_data_records);
2532
0
0
fisher_yates_shuffle(\@test_data_records);
2533
0
0
0
if ($self->{_debug}) {
2534
0
0
foreach my $record (@training_data_records) {
2535
0
0
print "$record";
2536
}
2537
0
0
foreach my $record (@test_data_records) {
2538
0
0
print "$record";
2539
}
2540
}
2541
0
0
open OUTPUT, ">$self->{_output_training_csv_file}";
2542
0
0
my @feature_names_training = @{$self->{_features_ordered}};
0
0
2543
0
0
my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
0
0
2544
0
0
my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2545
0
0
print OUTPUT "$first_row_training\n";
2546
0
0
foreach my $i (0..@training_data_records-1) {
2547
0
0
my $i1 = $i+1;
2548
0
0
my $sample_record = "\"$i1\",$training_data_records[$i]";
2549
0
0
print OUTPUT "$sample_record";
2550
}
2551
0
0
close OUTPUT;
2552
0
0
open OUTPUT, ">$self->{_output_test_csv_file}";
2553
0
0
my @feature_names_testing = keys %{$self->{_features_with_value_range}};
0
0
2554
0
0
my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
0
0
2555
0
0
my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2556
0
0
print OUTPUT "$first_row_testing\n";
2557
0
0
foreach my $i (0..@test_data_records-1) {
2558
0
0
my $i1 = $i+1;
2559
0
0
my $sample_record = "\"$i1\",$test_data_records[$i]";
2560
0
0
print OUTPUT "$sample_record";
2561
}
2562
0
0
close OUTPUT;
2563
}
2564
2565
# from perl docs:
2566
sub fisher_yates_shuffle {
2567
0
0
0
my $arr = shift;
2568
0
0
my $i = @$arr;
2569
0
0
while (--$i) {
2570
0
0
my $j = int rand( $i + 1 );
2571
0
0
@$arr[$i, $j] = @$arr[$j, $i];
2572
}
2573
}
2574
2575
########################### Generate Your Own Symbolic Training Data ###############################
2576
########################### Class TrainingDataGeneratorSymbolic #############################
2577
2578
## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2579
## this class for generating purely symbolic training and test data. The data is
2580
## generated according to the specifications you place in a parameter file whose
2581
## name you supply as one of the constructor arguments.
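## A minimal usage sketch (the file names are placeholders, and the calls are
## assumed to be made from a script that has already done
## `use Algorithm::DecisionTree;'):
##
##     my $generator = TrainingDataGeneratorSymbolic->new(
##                         output_training_datafile       => 'training_symbolic.csv',
##                         parameter_file                 => 'param_symbolic.txt',
##                         number_of_samples_for_training => 100,
##                     );
##     $generator->read_parameter_file_symbolic();
##     $generator->gen_symbolic_training_data();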
2582
package TrainingDataGeneratorSymbolic;
2583
2584
1
1
6
use strict;
1
1
1
21
2585
1
1
3
use Carp;
1
2
1
1734
2586
2587
sub new {
2588
1
1
14
my ($class, %args) = @_;
2589
1
4
my @params = keys %args;
2590
1
50
3
croak "\nYou have used a wrong name for a keyword argument " .
2591
"--- perhaps a misspelling\n"
2592
if check_for_illegal_params4(@params) == 0;
2593
bless {
2594
_output_training_datafile => $args{'output_training_datafile'}
2595
|| die("name for output_training_datafile required"),
2596
_parameter_file => $args{'parameter_file'}
2597
|| die("parameter_file required"),
2598
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2599
|| die("number_of_samples_for_training required"),
2600
1
50
17
_debug => $args{debug} || 0,
50
50
50
2601
_class_names => [],
2602
_class_priors => [],
2603
_features_and_values_hash => {},
2604
_bias_hash => {},
2605
_training_sample_records => {},
2606
}, $class;
2607
}
2608
2609
sub check_for_illegal_params4 {
2610
1
1
2
my @params = @_;
2611
1
2
my @legal_params = qw / output_training_datafile
2612
parameter_file
2613
number_of_samples_for_training
2614
debug
2615
/;
2616
1
1
my $found_match_flag;
2617
1
2
foreach my $param (@params) {
2618
3
3
foreach my $legal (@legal_params) {
2619
6
3
$found_match_flag = 0;
2620
6
100
10
if ($param eq $legal) {
2621
3
2
$found_match_flag = 1;
2622
3
3
last;
2623
}
2624
}
2625
3
50
6
last if $found_match_flag == 0;
2626
}
2627
1
3
return $found_match_flag;
2628
}
2629
2630
## Read a parameter file for generating symbolic training data. See the script
2631
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2632
## pass the name of the parameter file to the constructor of the
2633
## TrainingDataGeneratorSymbolic class.
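## Judging from the parsing logic in this method, the parameter file is expected
## to look roughly like the following (the class names, features, values, and
## biases are purely illustrative; the parameter files in the examples directory
## are the authoritative reference):
##
##     class names: benign malignant
##     class priors: 0.9 0.1
##     feature: smoking values: heavy medium light never
##     feature: exercising values: never occasionally regularly
##     bias: class: benign smoking: never=0.7 exercising: regularly=0.5
##     bias: class: malignant smoking: heavy=0.8 exercising: never=0.6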
2634
sub read_parameter_file_symbolic {
2635
1
1
4
my $self = shift;
2636
1
5
my $debug = $self->{_debug};
2637
1
1
my $number_of_training_samples = $self->{_number_of_samples_for_training};
2638
1
2
my $input_parameter_file = $self->{_parameter_file};
2639
1
50
3
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2640
1
2
my $output_file_training = $self->{_output_training_datafile};
2641
1
1
my $output_file_testing = $self->{_output_test_datafile};
2642
1
1
my @all_params;
2643
my $param_string;
2644
1
33
24
open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
2645
1
30
@all_params = <INPUT>;
2646
1
3
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
40
46
2647
1
3
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
36
58
2648
1
5
$param_string = join ' ', @all_params;
2649
1
7
my ($class_names, $class_priors, $rest_param) =
2650
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2651
1
50
40
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
3
14
2652
1
1
push @{$self->{_class_names}}, @class_names;
1
2
2653
1
50
5
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
3
10
2654
1
1
push @{$self->{_class_priors}}, @class_priors;
1
3
2655
1
8
my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2656
1
1
my %features_and_values_hash;
2657
1
6
my @features = split /(feature[:])/, $feature_string;
2658
1
50
5
@features = grep {defined($_) && length($_) > 0} @features;
9
23
2659
1
2
foreach my $item (@features) {
2660
8
100
12
next if $item =~ /feature/;
2661
4
13
my @splits = split / /, $item;
2662
4
50
4
@splits = grep {defined($_) && length($_) > 0} @splits;
27
61
2663
4
8
foreach my $i (0..@splits-1) {
2664
22
100
24
if ($i == 0) {
2665
4
7
$features_and_values_hash{$splits[0]} = [];
2666
} else {
2667
18
100
25
next if $splits[$i] =~ /values/;
2668
14
9
push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
14
21
2669
}
2670
}
2671
}
2672
1
2
$self->{_features_and_values_hash} = \%features_and_values_hash;
2673
1
1
my %bias_hash = %{$self->{_bias_hash}};
1
3
2674
1
7
my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2675
1
50
1
@biases = grep {defined($_) && length($_) > 0} @biases;
5
13
2676
1
2
foreach my $item (@biases) {
2677
4
100
8
next if $item =~ /bias/;
2678
2
16
my @splits = split /\s+/, $item;
2679
2
50
4
@splits = grep {defined($_) && length($_) > 0} @splits;
18
43
2680
2
1
my $feature_name;
2681
2
4
foreach my $i (0..@splits-1) {
2682
16
100
29
if ($i == 0) {
100
2683
2
3
$bias_hash{$splits[0]} = {};
2684
} elsif ($splits[$i] =~ /(^.+)[:]$/) {
2685
8
9
$feature_name = $1;
2686
8
14
$bias_hash{$splits[0]}->{$feature_name} = [];
2687
} else {
2688
6
50
9
next if !defined $feature_name;
2689
6
50
8
push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
6
11
2690
if defined $feature_name;
2691
}
2692
}
2693
}
2694
1
1
$self->{_bias_hash} = \%bias_hash;
2695
1
50
10
if ($debug) {
2696
0
0
print "\n\nClass names: @class_names\n";
2697
0
0
my $num_of_classes = @class_names;
2698
0
0
print "Class priors: @class_priors\n";
2699
0
0
print "Number of classes: $num_of_classes\n";
2700
0
0
print "\nHere are the features and their possible values:\n";
2701
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
2702
0
0
print "$k ===> @$v\n";
2703
}
2704
0
0
print "\nHere is the biasing for each class:\n";
2705
0
0
while ( my ($k, $v) = each %bias_hash ) {
2706
0
0
print "$k:\n";
2707
0
0
while ( my ($k1, $v1) = each %$v ) {
2708
0
0
print " $k1 ===> @$v1\n";
2709
}
2710
}
2711
}
2712
}
2713
2714
## This method generates training data according to the specifications placed in a
2715
## parameter file that is read by the previous method.
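## Given the write-out logic below, the generated training file is a CSV whose
## header row starts with an empty column followed by `class' and the sorted
## feature names, and whose records carry a sample index, the class label, and
## the feature values. With the illustrative parameter file sketched earlier,
## the output would look something like:
##
##     ,class,exercising,smoking
##     0,benign,regularly,never
##     1,malignant,occasionally,heavy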
2716
sub gen_symbolic_training_data {
2717
1
1
5
my $self = shift;
2718
1
1
my @class_names = @{$self->{_class_names}};
1
3
2719
1
1
my @class_priors = @{$self->{_class_priors}};
1
3
2720
1
1
my %training_sample_records;
2721
1
1
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
1
4
2722
1
1
my %bias_hash = %{$self->{_bias_hash}};
1
3
2723
1
1
my $how_many_training_samples = $self->{_number_of_samples_for_training};
2724
1
1
my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2725
1
1
my %class_priors_to_unit_interval_map;
2726
1
2
my $accumulated_interval = 0;
2727
1
2
foreach my $i (0..@class_names-1) {
2728
2
5
$class_priors_to_unit_interval_map{$class_names[$i]}
2729
= [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2730
2
4
$accumulated_interval += $class_priors[$i];
2731
}
2732
1
50
2
if ($self->{_debug}) {
2733
0
0
print "Mapping of class priors to unit interval: \n";
2734
0
0
while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2735
0
0
print "$k => @$v\n";
2736
}
2737
0
0
print "\n\n";
2738
}
2739
1
2
my $ele_index = 0;
2740
1
2
while ($ele_index < $how_many_training_samples) {
2741
35
33
my $sample_name = "sample" . "_$ele_index";
2742
35
50
$training_sample_records{$sample_name} = [];
2743
# Generate class label for this training sample:
2744
35
49
my $roll_the_dice = rand(1.0);
2745
35
33
my $class_label;
2746
35
39
foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2747
46
30
my $v = $class_priors_to_unit_interval_map{$class_name};
2748
46
100
66
121
if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2749
35
21
push @{$training_sample_records{$sample_name}},
35
51
2750
"class=" . $class_name;
2751
35
25
$class_label = $class_name;
2752
35
36
last;
2753
}
2754
}
2755
35
45
foreach my $feature (keys %features_and_values_hash) {
2756
140
88
my @values = @{$features_and_values_hash{$feature}};
140
212
2757
140
123
my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2758
140
111
my $no_bias = 1.0 / @values;
2759
140
100
271
$bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2760
140
68
my %value_priors_to_unit_interval_map;
2761
140
352
my @splits = split /\s*=\s*/, $bias_string;
2762
140
102
my $chosen_for_bias_value = $splits[0];
2763
140
91
my $chosen_bias = $splits[1];
2764
140
151
my $remaining_bias = 1 - $chosen_bias;
2765
140
123
my $remaining_portion_bias = $remaining_bias / (@values -1);
2766
140
50
111
@splits = grep {defined($_) && length($_) > 0} @splits;
280
786
2767
140
105
my $accumulated = 0;
2768
140
149
foreach my $i (0..@values-1) {
2769
490
100
481
if ($values[$i] eq $chosen_for_bias_value) {
2770
140
197
$value_priors_to_unit_interval_map{$values[$i]}
2771
= [$accumulated, $accumulated + $chosen_bias];
2772
140
124
$accumulated += $chosen_bias;
2773
} else {
2774
350
411
$value_priors_to_unit_interval_map{$values[$i]}
2775
= [$accumulated, $accumulated + $remaining_portion_bias];
2776
350
283
$accumulated += $remaining_portion_bias;
2777
}
2778
}
2779
140
108
my $roll_the_dice = rand(1.0);
2780
140
80
my $value_label;
2781
140
173
foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2782
312
222
my $v = $value_priors_to_unit_interval_map{$value_name};
2783
312
100
100
749
if ( ($roll_the_dice >= $v->[0])
2784
&& ($roll_the_dice <= $v->[1]) ) {
2785
140
79
push @{$training_sample_records{$sample_name}},
140
268
2786
$feature . "=" . $value_name;
2787
140
91
$value_label = $value_name;
2788
140
102
last;
2789
}
2790
}
2791
140
50
401
if ($self->{_debug}) {
2792
0
0
print "mapping feature value priors for '$feature' " .
2793
"to unit interval: \n";
2794
0
0
while ( my ($k, $v) =
2795
each %value_priors_to_unit_interval_map ) {
2796
0
0
print "$k => @$v\n";
2797
}
2798
0
0
print "\n\n";
2799
}
2800
}
2801
35
61
$ele_index++;
2802
}
2803
1
2
$self->{_training_sample_records} = \%training_sample_records;
2804
1
50
3
if ($self->{_debug}) {
2805
0
0
print "\n\nPRINTING TRAINING RECORDS:\n\n";
2806
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
0
0
2807
0
0
print "$kee => @{$training_sample_records{$kee}}\n\n";
0
0
2808
}
2809
}
2810
1
1
my $output_training_file = $self->{_output_training_datafile};
2811
1
50
3
print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2812
1
77
open FILEHANDLE, ">$output_training_file";
2813
1
7
my @features = sort keys %features_and_values_hash;
2814
1
1
my $title_string = ',class';
2815
1
2
foreach my $feature_name (@features) {
2816
4
5
$title_string .= ',' . $feature_name;
2817
}
2818
1
11
print FILEHANDLE "$title_string\n";
2819
1
11
my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
105
91
35
38
35
36
2820
1
5
my $record_string = '';
2821
1
2
foreach my $sample_name (@sample_names) {
2822
35
27
$record_string .= "$sample_name,";
2823
35
19
my @record = @{$training_sample_records{"sample_$sample_name"}};
35
55
2824
35
45
my %item_parts_hash;
2825
35
26
foreach my $item (@record) {
2826
175
220
my @splits = grep $_, split /=/, $item;
2827
175
196
$item_parts_hash{$splits[0]} = $splits[1];
2828
}
2829
35
25
$record_string .= $item_parts_hash{"class"};
2830
35
29
delete $item_parts_hash{"class"};
2831
35
60
my @kees = sort keys %item_parts_hash;
2832
35
30
foreach my $kee (@kees) {
2833
140
115
$record_string .= ",$item_parts_hash{$kee}";
2834
}
2835
35
33
print FILEHANDLE "$record_string\n";
2836
35
47
$record_string = '';
2837
}
2838
1
36
close FILEHANDLE;
2839
}
2840
2841
sub sample_index {
2842
0
0
my $arg = shift;
2843
0
$arg =~ /_(.+)$/;
2844
0
return $1;
2845
}
2846
2847
################################# Decision Tree Introspection #######################################
2848
################################# Class DTIntrospection #######################################
2849
2850
package DTIntrospection;
2851
2852
## Instances constructed from this class can provide explanations for the
2853
## classification decisions at the nodes of a decision tree.
2854
##
2855
## When used in the interactive mode, the decision-tree introspection made possible
2856
## by this class provides answers to the following three questions: (1) List of the
2857
## training samples that fall in the portion of the feature space that corresponds
2858
## to a node of the decision tree; (2) The probabilities associated with the last
2859
## feature test that led to the node; and (3) The class probabilities predicated on
2860
## just the last feature test on the path to that node.
2861
##
2862
## CAVEAT: It is possible for a node to exist even when there are no training
2863
## samples in the portion of the feature space that corresponds to the node. That
2864
## is because a decision tree is based on the probability densities estimated from
2865
## the training data. When training data is non-uniformly distributed, it is
2866
## possible for the probability associated with a point in the feature space to be
2867
## non-zero even when there are no training samples at or in the vicinity of that
2868
## point.
2869
##
2870
## That a node can exist even when there are no training samples in the portion of
2871
## the feature space that belongs to the node is an indication of the generalization
2872
## ability of decision-tree based classification.
2873
##
2874
## When used in a non-interactive mode, an instance of this class can be used to
2875
## create a tabular display that shows what training samples belong directly to the
2876
## portion of the feature space that corresponds to each node of the decision tree.
2877
## An instance of this class can also construct a tabular display that shows how the
2878
## influence of each training sample propagates in the decision tree. For each
2879
## training sample, this display first shows the list of nodes that came into
2880
## existence through feature test(s) that used the data provided by that sample.
2881
## This list for each training sample is followed by a subtree of the nodes that owe
2882
## their existence indirectly to the training sample. A training sample influences a
2883
## node indirectly if the node is a descendant of another node that is affected
2884
## directly by the training sample.
2885
2886
1
1
5
use strict;
1
1
1
17
2887
1
1
4
use Carp;
1
0
1
1992
2888
2889
sub new {
2890
0
0
my ($class, $dt) = @_;
2891
0
0
croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2892
unless ref($dt) eq "Algorithm::DecisionTree";
2893
bless {
2894
_dt => $dt,
2895
_root_dtnode => $dt->{_root_node},
2896
0
_samples_at_nodes_hash => {},
2897
_branch_features_to_nodes_hash => {},
2898
_sample_to_node_mapping_direct_hash => {},
2899
_node_serial_num_to_node_hash => {},
2900
_awareness_raising_msg_shown => 0,
2901
_debug => 0,
2902
}, $class;
2903
}
2904
2905
sub initialize {
2906
0
0
my $self = shift;
2907
croak "You must first construct the decision tree before using the DTIntrospection class"
2908
0
0
unless $self->{_root_dtnode};
2909
0
$self->recursive_descent($self->{_root_dtnode});
2910
}
2911
2912
sub recursive_descent {
2913
0
0
my $self = shift;
2914
0
my $node = shift;
2915
0
my $node_serial_number = $node->get_serial_num();
2916
0
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2917
0
0
print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
0
2918
0
$self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2919
0
$self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2920
0
my @samples_at_node = ();
2921
0
foreach my $item (@$branch_features_and_values_or_thresholds) {
2922
0
my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2923
0
0
unless (@samples_at_node) {
2924
0
@samples_at_node = @$samples_for_feature_value_combo;
2925
} else {
2926
0
my @accum;
2927
0
foreach my $sample (@samples_at_node) {
2928
0
0
push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2929
}
2930
0
@samples_at_node = @accum;
2931
}
2932
0
0
last unless @samples_at_node;
2933
}
2934
0
@samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
0
2935
0
0
print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2936
0
$self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2937
0
0
if (@samples_at_node) {
2938
0
foreach my $sample (@samples_at_node) {
2939
0
0
if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2940
0
$self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2941
} else {
2942
0
push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
0
2943
}
2944
}
2945
}
2946
0
my $children = $node->get_children();
2947
0
foreach my $child (@$children) {
2948
0
$self->recursive_descent($child);
2949
}
2950
}
2951
2952
sub display_training_samples_at_all_nodes_direct_influence_only {
2953
0
0
my $self = shift;
2954
croak "You must first construct the decision tree before using the DT Introspection class."
2955
0
0
unless $self->{_root_dtnode};
2956
0
$self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2957
}
2958
2959
sub recursive_descent_for_showing_samples_at_a_node{
2960
0
0
my $self = shift;
2961
0
my $node = shift;
2962
0
my $node_serial_number = $node->get_serial_num();
2963
0
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2964
0
0
if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2965
0
0
print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
0
2966
0
print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
0
2967
}
2968
0
map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
0
2969
}
2970
2971
sub display_training_samples_to_nodes_influence_propagation {
2972
0
0
my $self = shift;
2973
0
foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
0
0
2974
0
0
if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2975
0
my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2976
0
print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
0
2977
0
print " nodes affected through probabilistic generalization:\n";
2978
0
map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2979
}
2980
}
2981
}
2982
2983
sub recursive_descent_for_sample_to_node_influence {
2984
0
0
my $self = shift;
2985
0
my $node_serial_num = shift;
2986
0
my $nodes_already_accounted_for = shift;
2987
0
my $offset = shift;
2988
0
$offset .= " ";
2989
0
my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2990
0
my @children = map $_->get_serial_num(), @{$node->get_children()};
0
2991
0
my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
0
0
2992
0
0
if (@children_affected) {
2993
0
print "$offset $node_serial_num => [@children_affected]\n";
2994
}
2995
0
map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
2996
}
2997
2998
sub get_samples_for_feature_value_combo {
2999
0
0
my $self = shift;
3000
0
my $feature_value_combo = shift;
3001
0
my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3002
0
my @samples = ();
3003
0
0
if ($op eq '=') {
0
0
3004
0
@samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
0
0
3005
} elsif ($op eq '<') {
3006
0
foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
0
3007
0
my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
0
3008
0
foreach my $item (@features_and_values) {
3009
0
my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3010
0
0
0
if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
0
3011
0
push @samples, $sample;
3012
0
last;
3013
}
3014
}
3015
}
3016
} elsif ($op eq '>') {
3017
0
foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
0
3018
0
my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
0
3019
0
foreach my $item (@features_and_values) {
3020
0
my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3021
0
0
0
if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
0
3022
0
push @samples, $sample;
3023
0
last;
3024
}
3025
}
3026
}
3027
} else {
3028
0
die "Something strange is going on";
3029
}
3030
0
return \@samples;
3031
}
3032
3033
sub extract_feature_op_val {
3034
0
0
my $self = shift;
3035
0
my $feature_value_combo = shift;
3036
0
my $pattern1 = '(.+)=(.+)';
3037
0
my $pattern2 = '(.+)<(.+)';
3038
0
my $pattern3 = '(.+)>(.+)';
3039
0
my ($feature,$value,$op);
3040
0
0
if ($feature_value_combo =~ /$pattern2/) {
0
0
3041
0
($feature,$op,$value) = ($1,'<',$2);
3042
} elsif ($feature_value_combo =~ /$pattern3/) {
3043
0
($feature,$op,$value) = ($1,'>',$2);
3044
} elsif ($feature_value_combo =~ /$pattern1/) {
3045
0
($feature,$op,$value) = ($1,'=',$2);
3046
}
3047
0
return ($feature,$op,$value);
3048
}
3049
3050
sub explain_classifications_at_multiple_nodes_interactively {
3051
0
0
my $self = shift;
3052
croak "You called explain_classification_at_multiple_nodes_interactively() without " .
3053
"first initializing the DTIntrospection instance in your code. Aborting."
3054
0
0
unless $self->{_samples_at_nodes_hash};
3055
0
print "\n\nIn order for the decision tree to introspect\n\n";
3056
0
print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3057
" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3058
" with it even when there are NO training data points in that region?\n" .
3059
"\nEnter 'y' for yes or any other character for no: ";
3060
0
my $ans = <STDIN>;
3061
0
$ans =~ s/^\s*|\s*$//g;
3062
0
0
die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3063
0
$self->{_awareness_raising_msg_shown} = 1;
3064
0
while (1) {
3065
0
my $node_id;
3066
my $ans;
3067
0
while (1) {
3068
0
print "\nEnter the integer ID of a node: ";
3069
0
$ans = <STDIN>;
3070
0
$ans =~ s/^\s*|\s*$//g;
3071
0
0
return if $ans =~ /^exit$/;
3072
0
0
last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
0
3073
0
print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3074
}
3075
0
$node_id = $ans;
3076
0
$self->explain_classification_at_one_node($node_id)
3077
}
3078
}
3079
3080
sub explain_classification_at_one_node {
3081
0
0
my $self = shift;
3082
0
my $node_id = shift;
3083
croak "You called explain_classification_at_one_node() without first initializing " .
3084
0
0
"the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3085
0
0
unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3086
0
print "Node $node_id is not a node in the tree\n";
3087
0
return;
3088
}
3089
0
0
unless ($self->{_awareness_raising_msg_shown}) {
3090
0
print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3091
0
print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3092
" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3093
" with it even when there are NO training data points in that region?\n" .
3094
"\nEnter 'y' for yes or any other character for no: ";
3095
0
my $ans = <STDIN>;
3096
0
$ans =~ s/^\s*|\s*$//g;
3097
0
0
die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3098
}
3099
0
my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
0
3100
0
my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
0
3101
# my @class_names = @{$self->get_class_names()};
3102
0
my @class_names = $self->{_dt}->get_class_names();
3103
0
my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3104
0
my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3105
0
0
my $msg = @samples_at_node == 0
3106
? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3107
: "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3108
0
$msg .= "\n Feature tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3109
0
$msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3110
0
$msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3111
0
print $msg;
3112
0
my $ans = <STDIN>;
3113
0
$ans =~ s/^\s*|\s*$//g;
3114
0
0
if ($ans =~ /^ye?s?$/) {
3115
0
my $sequence = [$branch_features_to_node[-1]];
3116
0
my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3117
0
print "\n probability of @{$sequence} is: $prob\n";
0
3118
}
3119
0
$msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3120
0
$msg .= "\n Enter 'y' for yes and any other character for no: ";
3121
0
print $msg;
3122
0
$ans = <STDIN>;
3123
0
$ans =~ s/^\s*|\s*$//g;
3124
0
0
if ($ans =~ /^ye?s?$/) {
3125
0
my $sequence = [$branch_features_to_node[-1]];
3126
0
foreach my $cls (@class_names) {
3127
0
my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3128
0
print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
0
3129
}
3130
} else {
3131
0
print "goodbye\n";
3132
}
3133
0
print "\n Finished supplying information on Node $node_id\n\n";
3134
}
3135
3136
1;
3137
3138
=pod
3139
3140
=head1 NAME
3141
3142
Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3143
multidimensional data.
3144
3145
3146
=head1 SYNOPSIS
3147
3148
# FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3149
3150
# In general, your call for constructing an instance of the DecisionTree class
3151
# will look like:
3152
3153
my $training_datafile = "stage3cancer.csv";
3154
my $dt = Algorithm::DecisionTree->new(
3155
training_datafile => $training_datafile,
3156
csv_class_column_index => 2,
3157
csv_columns_for_features => [3,4,5,6,7,8],
3158
entropy_threshold => 0.01,
3159
max_depth_desired => 8,
3160
symbolic_to_numeric_cardinality_threshold => 10,
3161
csv_cleanup_needed => 1,
3162
);
3163
3164
# The constructor option `csv_class_column_index' informs the module as to which
3165
# column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3166
# BASED. The constructor option `csv_columns_for_features' specifies which columns
3167
# are to be used for feature values. The first row of the CSV file must specify
3168
# the names of the features. See examples of CSV files in the `Examples'
3169
# subdirectory.
3170
3171
# The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3172
# the example shown above, if an ostensibly numeric feature takes on only 10 or
3173
# fewer different values in your training datafile, it will be treated like a
3174
# symbolic features. The option `entropy_threshold' determines the granularity
3175
# with which the entropies are sampled for the purpose of calculating entropy gain
3176
# with a particular choice of decision threshold for a numeric feature or a feature
3177
# value for a symbolic feature.
3178
3179
# The option 'csv_cleanup_needed' is by default set to 0. If you set it
3180
# to 1, that would cause all line records in your CSV file to be "sanitized" before
3181
# they are used for constructing a decision tree. You need this option if your CSV
3182
# file uses double-quoted field names and field values in the line records and if
3183
# such double-quoted strings are allowed to include commas for, presumably, better
3184
# readability.
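# As an illustration (this record is made up and not taken from any file shipped
# with the module), a line record like the following would require
# 'csv_cleanup_needed' to be set to 1, since the double-quoted income field
# contains a comma:

"37","customer_37","buyer","45,000","suburban"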
3185
3186
# After you have constructed an instance of the DecisionTree class as shown above,
3187
# you read in the training data file and initialize the probability cache by
3188
# calling:
3189
3190
$dt->get_training_data();
3191
$dt->calculate_first_order_probabilities();
3192
$dt->calculate_class_priors();
3193
3194
# Next you construct a decision tree for your training data by calling:
3195
3196
$root_node = $dt->construct_decision_tree_classifier();
3197
3198
# where $root_node is an instance of the DTNode class that is also defined in the
3199
# module file. Now you are ready to classify a new data record. Let's say that
3200
# your data record looks like:
3201
3202
my @test_sample = qw / g2=4.2
3203
grade=2.3
3204
gleason=4
3205
eet=1.7
3206
age=55.0
3207
ploidy=diploid /;
3208
3209
# You can classify it by calling:
3210
3211
my $classification = $dt->classify($root_node, \@test_sample);
3212
3213
# The call to `classify()' returns a reference to a hash whose keys are the class
3214
# names and the values the associated classification probabilities. This hash also
3215
# includes another key-value pair for the solution path from the root node to the
3216
# leaf node at which the final classification was carried out.
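# Here is a minimal sketch of how you might display the returned probabilities.
# It assumes that the solution path is stored in the hash under a key named
# 'solution_path'; every other key is a class name:

foreach my $class (sort keys %$classification) {
    next if $class eq 'solution_path';
    print "$class => $classification->{$class}\n";
}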
3217
3218
3219
=head1 CHANGES
3220
3221
B This version reintroduces C as an optional
3222
parameter in the module constructor. This was done in response to several requests
3223
received from the user community. (Previously, all line records from a CSV file were
3224
processed by the C function no matter what.) The main point made by
3225
the users was that invoking C when there was no need for CSV clean-up
3226
extracted a performance penalty when ingesting large database files with tens of
3227
thousands of line records. In addition to making C optional, I
3228
have also tweaked up the code in the C function in order to extract
3229
data from a larger range of messy CSV files.
3230
3231
B All the changes made in this version relate to the construction of
3232
regression trees. I have fixed a couple of bugs in the calculation of the regression
3233
coefficients. Additionally, the C class now comes with a new
3234
constructor parameter named C. For most cases, you'd set this
3235
parameter to 0, which causes the regression coefficients to be estimated through
3236
linear least-squares minimization.
3237
3238
B In addition to constructing decision trees, this version of the
3239
module also allows you to construct regression trees. The regression tree capability
3240
has been packed into a separate subclass, named C, of the main
3241
C class. The subdirectory C in the main
3242
installation directory illustrates how you can use this new functionality of the
3243
module.
3244
3245
B This version incorporates four very significant upgrades/changes to
3246
the C module: B<(1)> The CSV cleanup is now the default. So you do not
3247
have to set any special parameters in the constructor calls to initiate CSV
3248
cleanup. B<(2)> In the form of a new Perl class named C,
3249
this module provides you with an easy-to-use programming interface for attempting
3250
needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3251
dominated by a single class. You need to set the constructor parameter
3252
C to invoke the logic that constructs multiple
3253
decision trees, each using the minority class samples along with samples drawn
3254
randomly from the majority class. The final classification is made through a
3255
majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3256
big-data problem --- in the sense that you have been given a training database with a
3257
very large number of training records --- the class C will
3258
also let you construct multiple decision trees by pulling training data randomly from
3259
your training database (without paying attention to the relative populations of the
3260
classes). The final classification decision for a test sample is based on a majority
3261
vote from all the decision trees thus constructed. See the C
3262
directory for how to use these new features of the module. And, finally, B<(4)>
3263
Support for the old-style '.dat' training files has been dropped in this version.
3264
3265
B This version makes it easier to use a CSV training file that
3266
violates the assumption that a comma be used only to separate the different field
3267
values in a line record. Some large econometrics databases use double-quoted values
3268
for fields, and these values may contain commas (presumably for better readability).
3269
This version also allows you to specify the leftmost entry in the first CSV record
3270
that names all the fields. Previously, this entry was required to be an empty
3271
double-quoted string. I have also made some minor changes to the
3272
'C' method to make it more user friendly for large
3273
training files that may contain tens of thousands of records. When pulling training
3274
data from such files, this method prints out a dot on the terminal screen for every
3275
10000 records it has processed.
3276
3277
B This version brings the boosting capability to the C
3278
module.
3279
3280
B This version adds bagging to the C module. If your
3281
training dataset is large enough, you can ask the module to construct multiple
3282
decision trees using data bags extracted from your dataset. The module can show you
3283
the results returned by the individual decision trees and also the results obtained
3284
by taking a majority vote of the classification decisions made by the individual
3285
trees. You can specify any arbitrary extent of overlap between the data bags.
3286
3287
B The introspection capability in this version packs more of a punch.
3288
For each training data sample, you can now figure out not only the decision-tree
3289
nodes that are affected directly by that sample, but also those nodes that are
3290
affected indirectly through the generalization achieved by the probabilistic modeling
3291
of the data. The 'examples' directory of this version includes additional scripts
3292
that illustrate these enhancements to the introspection capability. See the section
3293
"The Introspection API" for a declaration of the introspection related methods, old
3294
and new.
3295
3296
B In response to requests from several users, this version includes a new
3297
capability: You can now ask the module to introspect about the classification
3298
decisions returned by the decision tree. Toward that end, the module includes a new
3299
class named C. Perhaps the most important bit of information you
3300
are likely to seek through DT introspection is the list of the training samples that
3301
fall directly in the portion of the feature space that is assigned to a node.
3302
B When training samples are non-uniformly distributed in the underlying
3303
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3304
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3305
part of the generalization achieved by probabilistic modeling of the training data.)>
3306
For additional information related to DT introspection, see the section titled
3307
"DECISION TREE INTROSPECTION" in this documentation page.
3308
3309
B makes the logic of tree construction from the old-style '.dat' training
3310
files more consistent with how trees are constructed from the data in `.csv' files.
3311
The inconsistency in the past was with respect to the naming convention for the class
3312
labels associated with the different data records.
3313
3314
B fixes a bug in the part of the module that some folks use for generating
3315
synthetic data for experimenting with decision tree construction and classification.
3316
In the class C that is a part of the module, there
3317
was a problem with the order in which the features were recorded from the
3318
user-supplied parameter file. The basic code for decision tree construction and
3319
classification remains unchanged.
3320
3321
B further downshifts the required version of Perl for this module. This
3322
was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3323
in the module code needed to be changed for the module to work with the older version
3324
of Perl.
3325
3326
B fixes the C restriction on the required Perl version. This
3327
version should work with Perl versions 5.14.0 and higher.
3328
3329
B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3330
else remains the same.
3331
3332
B should prove more robust when the probability distribution for the
3333
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3334
observations can occur with significant probabilities. A new option in the
3335
DecisionTree constructor lets the user specify the precision with which the
3336
probability distributions are estimated for such features.
3337
3338
B fixes a bug that was caused by the explicitly set zero values for
3339
numerical features being misconstrued as "false" in the conditional statements in
3340
some of the method definitions.
3341
3342
B makes it easier to write code for classifying in one go all of your test
3343
data samples in a CSV file. The bulk classifications obtained can be written out to
3344
either a CSV file or to a regular text file. See the script
3345
C in the C directory for how to
3346
classify all of your test data records in a CSV file. This version also includes
3347
improved code for generating synthetic numeric/symbolic training and test data
3348
records for experimenting with the decision tree classifier.
3349
3350
B allows you to test the quality of your training data by running a 10-fold
3351
cross-validation test on the data. This test divides all of the training data into
3352
ten parts, with nine parts used for training a decision tree and one part used for
3353
testing its ability to classify correctly. This selection of nine parts for training
3354
and one part for testing is carried out in all of the ten different ways that are
3355
possible. This testing functionality in Version 2.1 can also be used to find the
3356
best values to use for the constructor parameters C,
3357
C, and C.
3358
3359
B Now you can use both numeric and
3360
symbolic features for constructing a decision tree. A feature is numeric if it can
3361
take any floating-point value over an interval.
3362
3363
B fixes a bug in the code that was triggered by 0 being declared as one of
3364
the feature values in the training datafile. Version 1.71 also includes an additional
3365
safety feature that is useful for training datafiles that contain a very large number
3366
of features. The new version makes sure that the number of values you declare for
3367
each sample record matches the number of features declared at the beginning of the
3368
training datafile.
3369
3370
B includes safety checks on the consistency of the data you place in your
3371
training datafile. When a training file contains thousands of samples, it is
3372
difficult to manually check that you used the same class names in your sample records
3373
that you declared at top of your training file or that the values you have for your
3374
features are legal vis-a-vis the earlier declarations of the values in the training
3375
file. Another safety feature incorporated in this version is the non-consideration
3376
of classes that are declared at the top of the training file but that have no sample
3377
records in the file.
3378
3379
B uses probability caching much more extensively compared to the previous
3380
versions. This should result in faster construction of large decision trees.
3381
Another new feature in Version 1.6 is the use of a decision tree for interactive
3382
classification. In this mode, after you have constructed a decision tree from the
3383
training data, the user is prompted for answers to the questions pertaining to the
3384
feature tests at the nodes of the tree.
3385
3386
Some key elements of the documentation were cleaned up and made more readable in
3387
B. The implementation code remains unchanged from Version 1.4.
3388
3389
B should make things faster (and easier) for folks who want to use this
3390
module with training data that creates very large decision trees (that is, trees with
3391
tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3392
achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3393
addition, this version provides an additional constructor parameter,
3394
C for controlling the size of the decision tree. This is in
3395
addition to the tree size control achieved by the parameter C that
3396
was introduced in Version 1.3. Since large decision trees can take a long time to
3397
create, you may find yourself wishing you could store the tree you just created in a
3398
disk file and that, subsequently, you could use the stored tree for classification
3399
work. The C directory contains two scripts, C and
3400
C, that show how you can do exactly that with the
3401
help of Perl's C module.
3402
3403
B addresses the issue that arises when the header of a training datafile
3404
declares a certain possible value for a feature but that (feature,value) pair does
3405
NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3406
user to control the size of the decision tree by changing the value of the parameter
3407
C Additionally, Version 1.3 includes a method called
3408
C that displays useful information regarding the size and
3409
some other attributes of the training data. It also warns the user if the training
3410
data might result in a decision tree that would simply be much too large --- unless
3411
the user controls the size with the entropy_threshold parameter.
3412
3413
In addition to the removal of a couple of serious bugs, B incorporates a
3414
number of enhancements: (1) Version 1.2 includes checks on the names of the features
3415
and values used in test data --- this is the data you want to classify with the
3416
decision tree classifier constructed by this module. (2) Version 1.2 includes a
3417
separate constructor for generating test data. To make it easier to generate test
3418
data whose probabilistic parameters may not be identical to that used for the
3419
training data, I have used separate routines for generating the test data. (3)
3420
Version 1.2 also includes in its examples directory a script that classifies the test
3421
data in a file and outputs the class labels into another file. This is for folks who
3422
do not wish to write their own scripts using this module. (4) Version 1.2 also
3423
includes addition to the documentation regarding the issue of numeric values for
3424
features.
3425
3426

=head1 DESCRIPTION

B<Algorithm::DecisionTree> is a I<Perl> module for constructing a decision tree from
a training datafile containing multidimensional data. In one form or another,
decision trees have been around for about fifty years. From a statistical
perspective, they are closely related to classification and regression by recursive
partitioning of multidimensional data. Early work that demonstrated the usefulness
of such partitioning of data for classification and regression can be traced to the
work of Terry Therneau in the early 1980's in the statistics community, and to the
work of Ross Quinlan in the mid 1990's in the machine learning community.

For those not familiar with decision tree ideas, the traditional way to classify
multidimensional data is to start with a feature space whose dimensionality is the
same as that of the data. Each feature in this space corresponds to the attribute
that each dimension of the data measures. You then use the training data to carve up
the feature space into different regions, each corresponding to a different class.
Subsequently, when you try to classify a new data sample, you locate it in the
feature space and find the class label of the region to which it belongs. One can
also give the new data point the same class label as that of the nearest training
sample. This is referred to as nearest neighbor classification. There exist
hundreds of variations of varying power on these two basic approaches to the
classification of multidimensional data.

A decision tree classifier works differently. When you construct a decision tree,
you select for the root node a feature test that partitions the training data in a
way that causes maximal disambiguation of the class labels associated with the data.
In terms of information content as measured by entropy, such a feature test would
cause maximum reduction in class entropy in going from all of the training data taken
together to the data as partitioned by the feature test. You then drop from the root
node a set of child nodes, one for each partition of the training data created by the
feature test at the root node. When your features are purely symbolic, you'll have
one child node for each value of the feature chosen for the feature test at the root.
When the test at the root involves a numeric feature, you find the decision threshold
for the feature that best bipartitions the data and you drop from the root node two
child nodes, one for each partition. Now at each child node you pose the same
question that you posed when you found the best feature to use at the root: Which
feature at the child node in question would maximally disambiguate the class labels
associated with the training data corresponding to that child node?

As the reader would expect, the two key steps in any approach to decision-tree based
classification are the construction of the decision tree itself from a file
containing the training data, and then using the decision tree thus obtained for
classifying new data.

What is cool about decision tree classification is that it gives you soft
classification, meaning it may associate more than one class label with a given data
vector. When this happens, it may mean that your classes are indeed overlapping in
the underlying feature space. It could also mean that you simply have not supplied
sufficient training data to the decision tree classifier. For a tutorial
introduction to how a decision tree is constructed and used, visit
L

This module also allows you to generate your own synthetic training and test
data. Generating your own training data, using it for constructing a decision-tree
classifier, and subsequently testing the classifier on a synthetically generated
test set of data is a good way to develop greater proficiency with decision trees.

=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE

If you are new to the concept of a decision tree, its practical utility is best
understood with an example that only involves symbolic features. However, as
mentioned earlier, versions of the module higher than 2.0 allow you to use both
symbolic and numeric features.

Consider the following scenario: Let's say you are running a small investment company
that employs a team of stockbrokers who make buy/sell decisions for the customers of
your company. Assume that your company has asked the traders to make each investment
decision on the basis of the following four criteria:

    price_to_earnings_ratio   (P_to_E)

    price_to_sales_ratio      (P_to_S)

    return_on_equity          (R_on_E)

    market_share              (MS)

Since you are the boss, you keep track of the buy/sell decisions made by the
individual traders. But one unfortunate day, all of your traders decide to quit
because you did not pay them enough. So what do you do? If you had a module like
the one here, you could still run your company and do so in such a way that, on the
average, would do better than any of the individual traders who worked for your
company. This is what you do: You pool together the individual trader buy/sell
decisions you have accumulated during the last one year. This pooled information is
likely to look like:

    example      buy/sell     P_to_E     P_to_S     R_on_E     MS
    ================================================================

    example_1    buy          high       low        medium     low
    example_2    buy          medium     medium     low        low
    example_3    sell         low        medium     low        high
    ....
    ....

This data, when formatted according to CSV, would constitute your training file. You
could feed this file into the module by calling:

    my $dt = Algorithm::DecisionTree->new(
                      training_datafile => $training_datafile,
                      csv_class_column_index => 1,
                      csv_columns_for_features => [2,3,4,5],
             );
    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();

Subsequently, you would construct a decision tree by calling:

    my $root_node = $dt->construct_decision_tree_classifier();

Now you and your company (with practically no employees) are ready to service the
customers again. Suppose your computer needs to make a buy/sell decision about an
investment prospect that is best described by:

    price_to_earnings_ratio  =  low
    price_to_sales_ratio     =  very_low
    return_on_equity         =  none
    market_share             =  medium

All that your computer would need to do would be to construct a data vector like

    my @data = qw / P_to_E=low
                    P_to_S=very_low
                    R_on_E=none
                    MS=medium /;

and call the decision tree classifier you just constructed by

    $dt->classify($root_node, \@data);

The answer returned will be 'buy' and 'sell', along with the associated
probabilities. So if the probability of 'buy' is considerably greater than the
probability of 'sell', that's what you should instruct your computer to do.

The chances are that, on the average, this approach would beat the performance of any
of your individual traders who worked for you previously since the buy/sell decisions
made by the computer would be based on the collective wisdom of all your previous
traders. B captured by the silly little example here. However, it does nicely convey
the sense in which the current module could be used.>
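
To make the above concrete, here is a minimal sketch of how you might inspect the
hash reference returned by C<classify()> and act on the most probable label. This is
illustrative code based on the description of C<classify()> given later in the
METHODS section; it is not a script from the distribution:

    my $classification = $dt->classify($root_node, \@data);
    my %probabilities  = %$classification;
    delete $probabilities{'solution_path'};     # keep only the class-name => probability pairs
    my ($best_label) = sort { $probabilities{$b} <=> $probabilities{$a} } keys %probabilities;
    printf "%-8s => %.4f\n", $_, $probabilities{$_}
        for sort { $probabilities{$b} <=> $probabilities{$a} } keys %probabilities;
    print "decision: $best_label\n";            # e.g. 'buy' if that has the larger probability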

=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES

A feature is symbolic when its values are compared using string comparison operators.
By the same token, a feature is numeric when its values are compared using numeric
comparison operators. Having said that, features that take only a small number of
numeric values in the training data can be treated symbolically provided you are
careful about handling their values in the test data. At the least, you have to set
the test data value for such a feature to its closest value in the training data.
The module does that automatically for you for those numeric features for which the
number of different numeric values is less than a user-specified threshold. For those
numeric features that the module is allowed to treat symbolically, this snapping of
the values of the features in the test data to the small set of values in the training
data is carried out automatically by the module. That is, after a user has told the
module which numeric features to treat symbolically, the user need not worry about
how the feature values appear in the test data.

The constructor parameter C<symbolic_to_numeric_cardinality_threshold> lets you tell
the module when to consider an otherwise numeric feature symbolically. Suppose you
set this parameter to 10; that means that all numeric looking features that take 10
or fewer different values in the training datafile will be considered to be symbolic
features by the module. See the tutorial at
L for
further information on the implementation issues related to the symbolic and numeric
features.
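
The value "snapping" described above can be pictured with a small stand-alone
sketch. This is purely illustrative Perl, not code taken from the module, and the
helper name C<snap_to_closest_training_value> is made up for the example:

    use strict;
    use warnings;

    # Given the values a symbolically-treated numeric feature takes in the
    # training data, snap a test-data value to the nearest of those values.
    sub snap_to_closest_training_value {
        my ($test_value, $training_values) = @_;
        my ($closest) = sort { abs($a - $test_value) <=> abs($b - $test_value) }
                             @$training_values;
        return $closest;
    }

    my @values_seen_in_training = (2.0, 2.5, 3.0, 4.0);
    print snap_to_closest_training_value(2.7, \@values_seen_in_training), "\n";   # prints 2.5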

=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES

For the purpose of estimating the probabilities, it is necessary to sample the range
of values taken on by a numerical feature. For features with "nice" statistical
properties, this sampling interval is set to the median of the differences between
the successive feature values in the training data. (Obviously, as you would expect,
you first sort all the values for a feature before computing the successive
differences.) This logic will not work for the sort of feature described below.

Consider a feature whose values are heavy-tailed, and, at the same time, the values
span a million to one range. What I mean by heavy-tailed is that rare values can
occur with significant probabilities. It could happen that most of the values for
such a feature are clustered at one of the two ends of the range. At the same time,
there may exist a significant number of values near the end of the range that is less
populated. (Typically, features related to human economic activities --- such as
wealth, incomes, etc. --- are of this type.) With the logic described in the
previous paragraph, you could end up with a sampling interval that is much too small,
which could result in millions of sampling points for the feature if you are not
careful.

Beginning with Version 2.22, you have two options in dealing with such features. You
can choose to go with the default behavior of the module, which is to sample the
value range for such a feature over a maximum of 500 points. Or, you can supply an
additional option to the constructor that sets a user-defined value for the number of
points to use. The name of the option is C<number_of_histogram_bins>. The following
script

    construct_dt_for_heavytailed.pl

in the C<Examples> directory shows an example of how to call the constructor of the
module with the C<number_of_histogram_bins> option.
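
For reference, a constructor call of that sort might look as follows; the file name
and the column indexes are placeholders, and only the C<number_of_histogram_bins>
line is the point of the sketch:

    my $dt = Algorithm::DecisionTree->new(
                     training_datafile         => 'heavytailed_data.csv',    # placeholder name
                     csv_class_column_index    => 1,
                     csv_columns_for_features  => [2,3,4],
                     number_of_histogram_bins  => 100,    # cap on sampling points for such features
                     csv_cleanup_needed        => 1,
             );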

=head1 TESTING THE QUALITY OF YOUR TRAINING DATA

Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
training data to test its ability to discriminate between the classes mentioned in
the training file.

The 10-fold cross-validation test divides all of the training data into ten parts,
with nine parts used for training a decision tree and one part used for testing its
ability to classify correctly. This selection of nine parts for training and one part
for testing is carried out in all of the ten different possible ways.

The following code fragment illustrates how you invoke the testing function of the
EvalTrainingData class:

    my $training_datafile = "training.csv";
    my $eval_data = EvalTrainingData->new(
                                  training_datafile => $training_datafile,
                                  csv_class_column_index => 1,
                                  csv_columns_for_features => [2,3],
                                  entropy_threshold => 0.01,
                                  max_depth_desired => 3,
                                  symbolic_to_numeric_cardinality_threshold => 10,
                                  csv_cleanup_needed => 1,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data();

The last statement above prints out a Confusion Matrix and the value of Training Data
Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
cross-validation test.

This testing functionality can also be used to find the best values to use for the
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>. (A small sketch of this use follows at
the end of this section.)

The following two scripts in the C<Examples> directory illustrate the use of the
C<EvalTrainingData> class for testing the quality of your data:

    evaluate_training_data1.pl
    evaluate_training_data2.pl
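
One simple way to use this for the parameter tuning mentioned above is to re-run the
cross-validation for a few candidate values and compare the printed Training Data
Quality Index for each run. A hedged sketch (the file name and column indexes are
placeholders):

    for my $depth (2, 3, 5, 8) {
        print "\n===== max_depth_desired = $depth =====\n";
        my $eval_data = EvalTrainingData->new(
                              training_datafile        => "training.csv",
                              csv_class_column_index   => 1,
                              csv_columns_for_features => [2,3],
                              entropy_threshold        => 0.01,
                              max_depth_desired        => $depth,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              csv_cleanup_needed       => 1,
                        );
        $eval_data->get_training_data();
        $eval_data->evaluate_training_data();     # prints the Confusion Matrix and the Quality Index
    }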

=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS

Assuming your training data is good, the quality of the results you get from a
decision tree would depend on the choices you make for the constructor parameters
C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>. You can optimize your choices for
these parameters by running the 10-fold cross-validation test that is made available
in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
in the module file. A description of how to run this test is in the previous section
of this document.

=head1 DECISION TREE INTROSPECTION

Starting with Version 2.30, you can ask the C<DTIntrospection> class of the module to
explain the classification decisions made at the different nodes of the decision
tree.

Perhaps the most important bit of information you are likely to seek through DT
introspection is the list of the training samples that fall directly in the portion
of the feature space that is assigned to a node.

However, note that, when training samples are non-uniformly distributed in the
underlying feature space, it is possible for a node to exist even when there are no
training samples in the portion of the feature space assigned to the node. That is
because the decision tree is constructed from the probability densities estimated
from the training data. When the training samples are non-uniformly distributed, it
is entirely possible for the estimated probability densities to be non-zero in a
small region around a point even when there are no training samples specifically in
that region. (After you have created a statistical model for, say, the height
distribution of people in a community, the model may return a non-zero probability
for the height values in a small interval even if the community does not include a
single individual whose height falls in that interval.)

That a decision-tree node can exist even when there are no training samples in that
portion of the feature space that belongs to the node is an important indication of
the generalization ability of a decision-tree-based classifier.

In light of the explanation provided above, before the DTIntrospection class supplies
any answers at all, it asks you to accept the fact that features can take on non-zero
probabilities at a point in the feature space even though there are zero training
samples at that point (or in a small region around that point). If you do not accept
this rudimentary fact, the introspection class will not yield any answers (since you
are not going to believe the answers anyway).

The point made above implies that the path leading to a node in the decision tree may
test a feature for a certain value or threshold despite the fact that the portion of
the feature space assigned to that node is devoid of any training data.

See the following three scripts in the Examples directory for how to carry out DT
introspection:

    introspection_in_a_loop_interactive.pl

    introspection_show_training_samples_at_all_nodes_direct_influence.pl

    introspection_show_training_samples_to_nodes_influence_propagation.pl

The first script places you in an interactive session in which you will first be
asked for the node number you are interested in. Subsequently, you will be asked
whether or not you are interested in specific questions that the introspection can
provide answers for. The second script descends down the decision tree and shows for
each node the training samples that fall directly in the portion of the feature space
assigned to that node. The third script shows for each training sample how it
affects the decision-tree nodes either directly or indirectly through the
generalization achieved by the probabilistic modeling of the data.

The output of the script
C<introspection_show_training_samples_at_all_nodes_direct_influence.pl> looks like:

    Node 0: the samples are: None
    Node 1: the samples are: [sample_46 sample_58]
    Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
    Node 3: the samples are: []
    Node 4: the samples are: []
    ...
    ...

The nodes for which no samples are listed come into existence through
the generalization achieved by the probabilistic modeling of the data.

The output produced by the script
C<introspection_show_training_samples_to_nodes_influence_propagation.pl> looks like

    sample_1:
       nodes affected directly: [2 5 19 23]
       nodes affected through probabilistic generalization:
            2=> [3 4 25]
            25=> [26]
            5=> [6]
            6=> [7 13]
            7=> [8 11]
            8=> [9 10]
            11=> [12]
            13=> [14 18]
            14=> [15 16]
            16=> [17]
            19=> [20]
            20=> [21 22]
            23=> [24]

    sample_4:
       nodes affected directly: [2 5 6 7 11]
       nodes affected through probabilistic generalization:
            2=> [3 4 25]
            25=> [26]
            5=> [19]
            19=> [20 23]
            20=> [21 22]
            23=> [24]
            6=> [13]
            13=> [14 18]
            14=> [15 16]
            16=> [17]
            7=> [8]
            8=> [9 10]
            11=> [12]

    ...
    ...
    ...

For each training sample, the display shown above first presents the list of nodes
that are directly affected by the sample. A node is affected directly by a sample if
the latter falls in the portion of the feature space that belongs to the former.
Subsequently, for each training sample, the display shows a subtree of the nodes that
are affected indirectly by the sample through the generalization achieved by the
probabilistic modeling of the data. In general, a node is affected indirectly by a
sample if it is a descendant of another node that is affected directly.

Also see the section titled B<THE INTROSPECTION API> regarding how to invoke the
introspection capabilities of the module in your own code.

=head1 METHODS

The module provides the following methods for constructing a decision tree from
training data in a disk file and for classifying new data records with the decision
tree thus constructed:

=over 4

=item B<new():>

    my $dt = Algorithm::DecisionTree->new(
                          training_datafile => $training_datafile,
                          csv_class_column_index => 2,
                          csv_columns_for_features => [3,4,5,6,7,8],
                          entropy_threshold => 0.01,
                          max_depth_desired => 8,
                          symbolic_to_numeric_cardinality_threshold => 10,
                          csv_cleanup_needed => 1,
             );

A call to C<new()> constructs a new instance of the C<Algorithm::DecisionTree> class.
For this call to make sense, the training data in the training datafile must be
in the CSV format.

=back

=head2 The Constructor Parameters

=over 8

=item C<training_datafile>:

This parameter supplies the name of the file that contains the training data.

=item C<csv_class_column_index>:

When using a CSV file for your training data, this parameter supplies the zero-based
column index for the column that contains the class label for each data record in the
training file.

=item C<csv_cleanup_needed>:

You need to set this parameter to 1 if your CSV file has double-quoted strings (which
may include commas) as values for the fields and if such values are allowed to
include commas for, presumably, better readability.

=item C<csv_columns_for_features>:

When using a CSV file for your training data, this parameter supplies a list of
columns corresponding to the features you wish to use for decision tree construction.
Each column is specified by its zero-based index.

=item C<entropy_threshold>:

This parameter sets the granularity with which the entropies are sampled by the
module. For example, a feature test at a node in the decision tree is acceptable if
the entropy gain achieved by the test exceeds this threshold. The larger the value
you choose for this parameter, the smaller the tree. Its default value is 0.01.

=item C<max_depth_desired>:

This parameter sets the maximum depth of the decision tree. For obvious reasons, the
smaller the value you choose for this parameter, the smaller the tree.

=item C<symbolic_to_numeric_cardinality_threshold>:

This parameter allows the module to treat an otherwise numeric feature symbolically
if the number of different values the feature takes in the training data file does
not exceed the value of this parameter.

=item C<number_of_histogram_bins>:

This parameter gives the user the option to set the number of points at which the
value range for a feature should be sampled for estimating the probabilities. This
parameter is effective only for those features that occupy a large value range and
whose probability distributions are heavy tailed. B<This parameter can also matter
when you have a very large training dataset:> In general, the larger the dataset, the
smaller the smallest difference between any two values for a numeric feature in
relation to the overall range of values for that feature. In such cases, the module
may use too large a number of bins for estimating the probabilities and that may slow
down the calculation of the decision tree. You can get around this difficulty by
explicitly giving a value to the C<number_of_histogram_bins> parameter.

=back

You can choose the best values to use for the last three constructor parameters by
running a 10-fold cross-validation test on your training data through the class
C<EvalTrainingData> that comes with Versions 2.1 and higher of this module. See the
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.

=over

=item B<get_training_data():>

After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
you must now read in the training data that is in the file named in the call to the
constructor. This you do by:

    $dt->get_training_data();

=item B<show_training_data():>

If you wish to see the training data that was just digested by the module,
call

    $dt->show_training_data();

=item B<calculate_first_order_probabilities():>

=item B<calculate_class_priors():>

After the module has read the training data file, it needs to initialize the
probability cache. This you do by invoking:

    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();

=item B<construct_decision_tree_classifier():>

With the probability cache initialized, it is time to construct a decision tree
classifier. This you do by

    my $root_node = $dt->construct_decision_tree_classifier();

This call returns an instance of type C<DTNode>. The C<DTNode> class is defined
within the main package file. So, don't forget that C<$root_node> in the above
example call will be instantiated to an object of type C<DTNode>.

=item B<$root_nodeC<< -> >>display_decision_tree(" "):>

    $root_node->display_decision_tree(" ");

This will display the decision tree in your terminal window by using a recursively
determined offset for each node as the display routine descends down the tree.

I have intentionally left the syntax fragment C<$root_node> in the above call to
remind the reader that C<display_decision_tree()> is NOT called on the instance of
the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
the call to C<construct_decision_tree_classifier()>.

=item B<classify($root_node, \@test_sample):>

Let's say you want to classify the following data record:

    my @test_sample = qw / g2=4.2
                           grade=2.3
                           gleason=4
                           eet=1.7
                           age=55.0
                           ploidy=diploid /;

you'd make the following call:

    my $classification = $dt->classify($root_node, \@test_sample);

where, again, C<$root_node> is an instance of type C<DTNode> returned by the call to
C<construct_decision_tree_classifier()>. The variable C<$classification> holds a
reference to a hash whose keys are the class names and whose values are the associated
probabilities. The hash that is returned by the above call also includes a special
key-value pair for a key named C<solution_path>. The value associated with this key
is an anonymous array that holds the path, in the form of a list of nodes, from the
root node to the leaf node in the decision tree where the final classification was
made. (A small usage sketch follows this list of methods.)

=item B<classify_by_asking_questions($root_node):>

This method allows you to use a decision-tree based classifier in an interactive
mode. In this mode, a user is prompted for answers to the questions pertaining to
the feature tests at the nodes of the tree. The syntax for invoking this method is:

    my $classification = $dt->classify_by_asking_questions($root_node);

where C<$dt> is an instance of the C<Algorithm::DecisionTree> class returned by a
call to C<new()> and C<$root_node> the root node of the decision tree returned by a
call to C<construct_decision_tree_classifier()>.

=back
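
Here is the small usage sketch promised in the description of C<classify()> above.
It assumes only the hash layout described there, with the class names, their
probabilities, and the special C<solution_path> entry:

    my $classification = $dt->classify($root_node, \@test_sample);

    my @path = @{ $classification->{'solution_path'} };        # root-to-leaf path, per the description above
    print "solution path: ", join(" -> ", @path), "\n";

    while ( my ($label, $probability) = each %$classification ) {
        next if $label eq 'solution_path';                      # skip the special entry
        print "$label  =>  $probability\n";
    }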

=head1 THE INTROSPECTION API

To construct an instance of C<DTIntrospection>, you call

    my $introspector = DTIntrospection->new($dt);

where you supply the instance of the C<DecisionTree> class you used for constructing
the decision tree through the parameter C<$dt>. After you have constructed an
instance of the introspection class, you must initialize it by

    $introspector->initialize();

Subsequently, you can invoke either of the following methods:

    $introspector->explain_classification_at_one_node($node);

    $introspector->explain_classifications_at_multiple_nodes_interactively();

depending on whether you want introspection at a single specified node or inside an
infinite loop for an arbitrary number of nodes.

If you want to output a tabular display that shows for each node in the decision tree
all the training samples that fall in the portion of the feature space that belongs
to that node, call

    $introspector->display_training_samples_at_all_nodes_direct_influence_only();

If you want to output a tabular display that shows for each training sample a list of
all the nodes that are affected directly AND indirectly by that sample, call

    $introspector->display_training_training_samples_to_nodes_influence_propagation();

A training sample affects a node directly if the sample falls in the portion of the
feature space assigned to that node. On the other hand, a training sample is
considered to affect a node indirectly if the node is a descendant of a node that is
affected directly by the sample.
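
Putting the calls above together, a minimal end-to-end sketch looks as follows. The
training file name and column indexes are placeholders, and the node number passed
to C<explain_classification_at_one_node()> is just an example:

    my $dt = Algorithm::DecisionTree->new(
                     training_datafile         => "training.csv",
                     csv_class_column_index    => 1,
                     csv_columns_for_features  => [2,3],
                     csv_cleanup_needed        => 1,
             );
    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
    my $root_node = $dt->construct_decision_tree_classifier();

    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();
    $introspector->explain_classification_at_one_node(1);       # ask about node 1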

=head1 BULK CLASSIFICATION OF DATA RECORDS

For large test datasets, you would obviously want to process an entire file of test
data at a time. The following script in the C<Examples> directory illustrates how you
can do that:

    classify_test_data_in_a_file.pl

This script requires three command-line arguments: the first argument names the
training datafile, the second the test datafile, and the third the file in which the
classification results are to be deposited.

The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
bulk classification of data records when you wish to take advantage of bagging,
boosting, or tree randomization. In their respective directories, these scripts are
named:

    bagging_for_bulk_classification.pl
    boosting_for_bulk_classification.pl
    classify_database_records.pl

=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED

It depends on whether you apply the classifier at once to all the data samples in a
file, or whether you feed one data sample at a time into the classifier.

In general, the classifier returns soft classification for a test data vector. What
that means is that, in general, the classifier will list all the classes to which a
given data vector could belong and the probability of each such class label for the
data vector. Run the example scripts in the Examples directory to see how the output
of classification can be displayed.

With regard to the soft classifications returned by this classifier, if the
probability distributions for the different classes overlap in the underlying feature
space, you would want the classifier to return all of the applicable class labels for
a data vector along with the corresponding class probabilities. Another reason
why the decision tree classifier may associate significant probabilities with
multiple class labels is that you used an inadequate number of training samples to
induce the decision tree. The good thing is that the classifier does not lie to you
(unlike, say, a hard classification rule that would return a single class label
corresponding to the partitioning of the underlying feature space). The decision
tree classifier gives you the best classification that can be made given the training
data you fed into it.

=head1 USING BAGGING

Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
comes with the module to incorporate bagging in your decision tree based
classification. Bagging means constructing multiple decision trees for different
(possibly overlapping) segments of the data extracted from your training dataset and
then aggregating the decisions made by the individual decision trees for the final
classification. The aggregation of the classification decisions can average out the
noise and bias that may otherwise affect the classification decision obtained from
just one tree.

=over 4

=item B

A typical call to the constructor for the C<DecisionTreeWithBagging> class looks
like:

    use Algorithm::DecisionTreeWithBagging;

    my $training_datafile = "stage3cancer.csv";

    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 2,
                              csv_columns_for_features => [3,4,5,6,7,8],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_bags => 4,
                              bag_overlap_fraction => 0.2,
                              csv_cleanup_needed => 1,
                );

Note in particular the following two constructor parameters:

    how_many_bags

    bag_overlap_fraction

where, as the name implies, the parameter C<how_many_bags> controls how many bags
(and, therefore, how many decision trees) will be constructed from your training
dataset; and where the parameter C<bag_overlap_fraction> controls the degree of
overlap between the bags. To understand what exactly is achieved by setting the
parameter C<bag_overlap_fraction> to 0.2 in the above example, let's say that the
non-overlapping partitioning of the training data between the bags results in 100
training samples per bag. With C<bag_overlap_fraction> set to 0.2, an additional 20
samples drawn randomly from the other bags will be added to the data in each bag.
(A short sketch of this arithmetic follows this list.)

=back
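
Here is the short sketch of that arithmetic promised above (plain Perl, independent
of the module):

    my $samples_per_bag      = 100;     # from the non-overlapping partitioning
    my $bag_overlap_fraction = 0.2;
    my $extra_samples        = int($samples_per_bag * $bag_overlap_fraction);
    printf "each bag ends up with %d samples\n", $samples_per_bag + $extra_samples;   # 120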

=head2 B<Methods defined for the C<DecisionTreeWithBagging> class>

=over 8

=item B

This method reads your training datafile, randomizes it, and then partitions it into
the specified number of bags. Subsequently, if the constructor parameter
C<bag_overlap_fraction> is non-zero, it adds to each bag additional samples drawn at
random from the other bags. The number of these additional samples added to each bag
is controlled by the constructor parameter C<bag_overlap_fraction>. If this
parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
drawn from the other bags.

=item B

Shows for each bag the names of the training data samples in that bag.

=item B

Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
first-order probabilities from the data samples in each bag.

=item B

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
class priors for the data samples in each bag.

=item B

Calls on the appropriate method of the main C<DecisionTree> class to construct a
decision tree from the training data in each bag.

=item B

Displays separately the decision tree constructed for each bag.

=item B

Calls on the appropriate methods of the main C<DecisionTree> class to classify the
argument test sample.

=item B

Displays separately the classification decision made by the decision tree
constructed for each bag.

=item B

Using majority voting, this method aggregates the classification decisions made by
the individual decision trees into a single decision.

=back

See the example scripts in the directory C<ExamplesBagging> for how to call these
methods for classifying individual samples and for bulk classification when you place
all your test samples in a single file.

=head1 USING BOOSTING

Starting with Version 3.20, you can use the class C<BoostedDecisionTree> for
constructing a boosted decision-tree classifier. Boosting results in a cascade of
decision trees in which each decision tree is constructed with samples that are
mostly those that are misclassified by the previous decision tree. To be precise,
you create a probability distribution over the training samples for the selection of
samples for training each decision tree in the cascade. To start out, the
distribution is uniform over all of the samples. Subsequently, this probability
distribution changes according to the misclassifications by each tree in the cascade:
if a sample is misclassified by a given tree in the cascade, the probability of its
being selected for training the next tree is increased significantly. You also
associate a trust factor with each decision tree depending on its power to classify
correctly all of the training data samples. After a cascade of decision trees is
constructed in this manner, you construct a final classifier that calculates the
class label for a test data sample by taking into account the classification
decisions made by each individual tree in the cascade, the decisions being weighted
by the trust factors associated with the individual classifiers. These boosting
notions --- generally referred to as the AdaBoost algorithm --- are based on a now
celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
tutorial introduction to AdaBoost, see L
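
To make the reweighting idea concrete, here is a small illustrative sketch of one
AdaBoost-style update for a single stage. This is NOT code from the
C<BoostedDecisionTree> class; it only shows how a misclassified sample's selection
probability grows and how a trust factor for the stage could be computed:

    use strict;
    use warnings;
    use List::Util qw(sum);

    # Current selection probabilities over four training samples, and a flag
    # for which of them the current tree misclassified:
    my @weights       = (0.25, 0.25, 0.25, 0.25);
    my @misclassified = (0,    1,    0,    0   );

    my $err   = sum( map { $weights[$_] * $misclassified[$_] } 0 .. $#weights );
    my $alpha = 0.5 * log( (1 - $err) / $err );                  # trust factor for this stage
    my @new_w = map { $weights[$_] * exp( $misclassified[$_] ? $alpha : -$alpha ) } 0 .. $#weights;
    my $z     = sum(@new_w);
    @new_w    = map { $_ / $z } @new_w;                          # renormalize to a distribution

    printf "trust factor for this stage: %.3f\n", $alpha;
    print  "updated selection weights:   ",
           join(", ", map { sprintf "%.3f", $_ } @new_w), "\n";  # the misclassified sample now gets 0.5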
Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
boosting apply only to the case of binary classification. Additionally, your
training dataset must capture all of the significant statistical variations in the
classes represented therein.

=over 4

=item B

If you'd like to experiment with boosting, a typical call to the constructor for the
C<BoostedDecisionTree> class looks like:

    use Algorithm::BoostedDecisionTree;
    my $training_datafile = "training6.csv";
    my $boosted = Algorithm::BoostedDecisionTree->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 1,
                              csv_columns_for_features => [2,3],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_stages => 4,
                              csv_cleanup_needed => 1,
                );

Note in particular the constructor parameter:

    how_many_stages

As its name implies, this parameter controls how many stages will be used in the
boosted decision tree classifier. As mentioned above, a separate decision tree is
constructed for each stage of boosting using a set of training samples that are drawn
through a probability distribution maintained over the entire training dataset.

=back

=head2 B<Methods defined for the C<BoostedDecisionTree> class>

=over 8

=item B

This method reads your training datafile and creates the data structures from the
data ingested for constructing the base decision tree.

=item B

Writes to the standard output the training data samples and also some relevant
properties of the features used in the training dataset.

=item B

Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
first-order probabilities and the class priors.

=item B

Calls on the appropriate method of the main C<DecisionTree> class to construct the
base decision tree.

=item B

Displays the base decision tree in your terminal window. (The textual form of the
decision tree is written out to the standard output.)

=item B

Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
earlier, the training samples for each tree in the cascade are drawn using a
probability distribution over the entire training dataset. This probability
distribution for any given tree in the cascade is heavily influenced by which
training samples are misclassified by the previous tree.

=item B

Displays separately in your terminal window the decision tree constructed for each
stage of the cascade. (The textual form of the trees is written out to the standard
output.)

=item B

Calls on each decision tree in the cascade to classify the argument C<$test_sample>.

=item B

You can call this method to display in your terminal window the classification
decision made by each decision tree in the cascade. The method also prints out the
trust factor associated with each decision tree. It is important to look
simultaneously at the classification decision and the trust factor for each tree ---
since a classification decision made by a specific tree may appear bizarre for a
given test sample. This method is useful primarily for debugging purposes.

=item B

As with the previous method, this method is useful mostly for debugging. It returns
class labels for the samples misclassified by the stage whose integer index is
supplied as an argument to the method. Say you have 10 stages in your cascade. The
value of the argument C would go from 0 to 9, with 0 corresponding to
the base tree.

=item B

Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
classification decisions made by the individual trees while taking into account the
trust factors associated with the trees. As mentioned earlier, we associate with
each tree of the cascade a trust factor that depends on the overall misclassification
rate associated with that tree.

=back

See the example scripts in the C<ExamplesBoosting> subdirectory for how to call the
methods listed above for classifying individual data samples with boosting and for
bulk classification when you place all your test samples in a single file.

=head1 USING RANDOMIZED DECISION TREES

As mentioned earlier, the new C<RandomizedTreesForBigData> class allows you to solve
the following two problems: (1) Data classification using the needle-in-a-haystack
metaphor, that is, when a vast majority of your training samples belong to just one
class. And (2) You have access to a very large database of training samples and you
wish to construct an ensemble of decision trees for classification.

=over 4

=item B

Here is how you'd call the C<RandomizedTreesForBigData> constructor for
needle-in-a-haystack classification:

    use Algorithm::RandomizedTreesForBigData;
    my $training_datafile = "your_database.csv";
    my $rt = Algorithm::RandomizedTreesForBigData->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 48,
                              csv_columns_for_features => [24,32,33,34,41],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_trees => 5,
                              looking_for_needles_in_haystack => 1,
                              csv_cleanup_needed => 1,
                );

Note in particular the constructor parameters:

    looking_for_needles_in_haystack
    how_many_trees

The first of these parameters, C<looking_for_needles_in_haystack>, invokes the logic
for constructing an ensemble of decision trees, each based on a training dataset that
uses all of the minority class samples, and a random drawing from the majority class
samples.

Here is how you'd call the C<RandomizedTreesForBigData> constructor for a more
general attempt at constructing an ensemble of decision trees, with each tree trained
with randomly drawn samples from a large database of training data (without paying
attention to the differences in the sizes of the populations for the different
classes):

    use Algorithm::RandomizedTreesForBigData;
    my $training_datafile = "your_database.csv";
    my $rt = Algorithm::RandomizedTreesForBigData->new(
                              training_datafile => $training_datafile,
                              csv_class_column_index => 2,
                              csv_columns_for_features => [3,4,5,6,7,8],
                              entropy_threshold => 0.01,
                              max_depth_desired => 8,
                              symbolic_to_numeric_cardinality_threshold => 10,
                              how_many_trees => 3,
                              how_many_training_samples_per_tree => 50,
                              csv_cleanup_needed => 1,
                );

Note in particular the constructor parameters:

    how_many_training_samples_per_tree
    how_many_trees

When you set the C<how_many_training_samples_per_tree> parameter, you are not allowed
to also set the C<looking_for_needles_in_haystack> parameter, and vice versa.

=back

=head2 B<Methods defined for the C<RandomizedTreesForBigData> class>

=over 8

=item B

What this method does depends on which of the two constructor parameters ---
C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
set. When the former is set, it creates a collection of training datasets for
C<how_many_trees> number of decision trees, with each dataset being a mixture of the
minority class and samples drawn randomly from the majority class. However, when the
latter option is set, all the datasets are drawn randomly from the training database
with no particular attention given to the relative populations of the two classes.

=item B

As the name implies, this method shows the training data being used for all the
decision trees. This method is useful for debugging purposes using small datasets.

=item B

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
first-order probabilities for the training dataset to be used for each decision tree.

=item B

Calls on the appropriate method of the main C<DecisionTree> class to estimate the
class priors for the training dataset to be used for each decision tree.

=item B

Calls on the appropriate method of the main C<DecisionTree> class to construct the
decision trees.

=item B

Displays all the decision trees in your terminal window. (The textual form of the
decision trees is written out to the standard output.)

=item B

The test sample is sent to each decision tree for classification.

=item B

The classification decisions returned by the individual decision trees are written
out to the standard output.

=item B

This method aggregates the classification results returned by the individual decision
trees and returns the majority decision.

=back

=head1 CONSTRUCTING REGRESSION TREES:

Decision tree based modeling requires that the class labels be distinct. That is,
the training dataset must contain a relatively small number of discrete class labels
for all of your data records if you want to model the data with one or more decision
trees. However, when one is trying to understand all of the associational
relationships that exist in a large database, one often runs into situations where,
instead of discrete class labels, you have a continuously valued variable as a
dependent variable whose values are predicated on a set of feature values. It is for
such situations that you will find useful the new class C<RegressionTree> that is now
a part of the C<Algorithm::DecisionTree> module. The C<RegressionTree> class has been
programmed as a subclass of the main C<DecisionTree> class.

You can think of regression with a regression tree as a powerful generalization of
the very commonly used Linear Regression algorithms. Although you can certainly
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
modeling nonlinearities between the predictor variables and the dependent variable,
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
can inject continuities between the predictor and the predicted variables that may
not really exist in the real data. Regression trees, on the other hand, give you a
piecewise linear relationship between the predictor and the predicted variables that
is freed from the constraints of superimposed continuities at the joins between the
different segments. See the following tutorial for further information regarding the
standard linear regression approach and the regression that can be achieved with the
RegressionTree class in this module:
L

The RegressionTree class in the current version of the module assumes that all of
your data is numerical. That is, unlike what is possible with the DecisionTree class
(and the other more closely related classes in this module) that allows your training
file to contain a mixture of numerical and symbolic data, the RegressionTree class
requires that ALL of your data be numerical. I hope to relax this constraint in
future versions of this module. Obviously, the dependent variable will always be
numerical for regression.

See the example scripts in the directory C<ExamplesRegression> if you wish to become
more familiar with the regression capabilities of the module.

=over 4

=item B

    my $training_datafile = "gendata5.csv";
    my $rt = Algorithm::RegressionTree->new(
                              training_datafile => $training_datafile,
                              dependent_variable_column => 2,
                              predictor_columns => [1],
                              mse_threshold => 0.01,
                              max_depth_desired => 2,
                              jacobian_choice => 0,
                              csv_cleanup_needed => 1,
                );

Note in particular the constructor parameters:

    dependent_variable_column
    predictor_columns
    mse_threshold
    jacobian_choice

The first of these parameters, C<dependent_variable_column>, is set to the column
index in the CSV file for the dependent variable. The second constructor parameter,
C<predictor_columns>, tells the system which columns contain values for the
predictor variables. The third parameter, C<mse_threshold>, is for deciding when to
partition the data at a node into two child nodes as a regression tree is being
constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
partitioning any of the features at a node is smaller than C<mse_threshold>, that
node becomes a leaf node of the regression tree.

The last parameter, C<jacobian_choice>, must be set to either 0 or 1 or 2. Its
default value is 0. When this parameter equals 0, the regression coefficients are
calculated using the linear least-squares method and no further "refinement" of the
coefficients is carried out using gradient descent. This is the fastest way to
calculate the regression coefficients. When C<jacobian_choice> is set to 1, you get
a weak version of gradient descent in which the Jacobian is set to the "design
matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
approximation to the Jacobian. That, however, is at a cost of much longer
computation time. B<NOTE:> For most cases, using 0 for C<jacobian_choice> is the
best choice. See my tutorial "I" for why
that is the case.

=back

=head2 B<Methods defined for the C<RegressionTree> class>

=over 8

=item B

Only CSV training datafiles are allowed. Additionally, the first record in the file
must list the names of the fields, and the first column must contain an integer ID
for each record.

=item B

As the name implies, this is the method that constructs a regression tree.

=item B

Displays the regression tree, as the name implies. The white-space string argument
specifies the offset to use in displaying the child nodes in relation to a parent
node.

=item B

You call this method after you have constructed a regression tree if you want to
calculate the prediction for one sample. The parameter C<$root_node> is what is
returned by the call to C. The formatting of the argument
bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
this case, the C<$test_sample> parameter will be bound to a list that will look like

    ['xvar1 = 23.4', 'xvar2 = 12.9']

Arbitrary amount of white space, including none, on the two sides of the equality
symbol is allowed in the construct shown above. A call to this method returns a
hash with two key-value pairs. One of the keys is called C<solution_path> and
the other C. The value associated with the key C<solution_path> is the path
in the regression tree to the leaf node that yielded the prediction. And the value
associated with the other key is the answer you are looking for.

=item B

This call calculates the predictions for all of the predictor variables data in your
training file. The parameter C<$root_node> is what is returned by the call to
C. The values for the dependent variable thus predicted
can be seen by calling C, which is the method mentioned below.

=item B

This method displays the results obtained by calling the prediction method of the
previous entry. This method also creates a hardcopy of the plots and saves it as a
C<.png> disk file. The name of this output file is always C.

=item B

This method carries out an error analysis of the predictions for the samples in your
training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
regression, the MSE for the data samples at each of the leaf nodes of the regression
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
The parameter C<$root_node> in the call syntax is what is returned by the call to
C.

=item B

Call this method if you want to apply the regression tree to all your test data in a
disk file. The predictions for all of the test samples in the disk file are written
out to another file whose name is the same as that of the test file except for the
addition of C<_output> in the name of the file. The parameter C<$filename> is the
name of the disk file that contains the test data. And the parameter C<$columns> is a
list of the column indices for the predictor variables in the test file.

=back

=head1 GENERATING SYNTHETIC TRAINING DATA

The module file contains the following additional classes: (1)
C, and (2) C for
generating synthetic training data.

The class C outputs one CSV file for the
training data and another one for the test data for experimenting with numeric
features. The numeric values are generated using a multivariate Gaussian
distribution whose mean and covariance are specified in a parameter file. See the
file C in the C<Examples> directory for an example of such a
parameter file. Note that the dimensionality of the data is inferred from the
information you place in the parameter file.

The class C generates synthetic training data for the
purely symbolic case. The relative frequencies of the different possible values for
the features are controlled by the biasing information you place in a parameter file.
See C for an example of such a file.
=head1 THE C DIRECTORY
4620
4621
See the C directory in the distribution for how to construct a decision
4622
tree, and how to then classify new data using the decision tree. To become more
4623
familiar with the module, run the scripts
4624
4625
construct_dt_and_classify_one_sample_case1.pl
4626
construct_dt_and_classify_one_sample_case2.pl
4627
construct_dt_and_classify_one_sample_case3.pl
4628
construct_dt_and_classify_one_sample_case4.pl
4629
4630
The first script is for the purely symbolic case, the second for the case that
4631
involves both numeric and symbolic features, the third for the case of purely numeric
4632
features, and the last for the case when the training data is synthetically generated
4633
by the script C.
4634
4635
Next run the following script as it is for bulk classification of data records placed
4636
in a CSV file:
4637
4638
classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4639
4640
The script first constructs a decision tree using the training data in the training
4641
file supplied by the first argument file C. The script then
4642
calculates the class label for each data record in the test data file supplied
4643
through the second argument file, C. The estimated class labels are
4644
written out to the output file which in the call shown above is C. An
4645
important thing to note here is that your test file --- in this case C ---
4646
must have a column for class labels. Obviously, in real-life situations, there will
4647
be no class labels in this column. What that is the case, you can place an empty
4648
string C<""> there for each data record. This is demonstrated by the following call:
4649
4650
classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4651
4652
The following script in the C directory
4653
4654
classify_by_asking_questions.pl
4655
4656
shows how you can use a decision-tree classifier interactively. In this mode, you
4657
first construct the decision tree from the training data and then the user is
4658
prompted for answers to the feature tests at the nodes of the tree.
4659
4660
If your training data has a feature whose values span a large range and, at the same
4661
time, are characterized by a heavy-tail distribution, you should look at the script
4662
4663
construct_dt_for_heavytailed.pl
4664
4665
to see how to use the option C in the call to the
4666
constructor. This option was introduced in Version 2.22 for dealing with such
4667
features. If you do not set this option, the module will use the default value of
4668
500 for the number of points at which to sample the value range for such a feature.
4669
4670
The C directory also contains the following scripts:
4671
4672
generate_training_data_numeric.pl
4673
generate_training_data_symbolic.pl
4674
4675
that show how you can use the module to generate synthetic training. Synthetic
4676
training is generated according to the specifications laid out in a parameter file.
4677
There are constraints on how the information is laid out in a parameter file. See
4678
the files C and C in the C directory
4679
for how to structure these files.
4680
4681
The C directory of Versions 2.1 and higher of the module also contains the
4682
following two scripts:
4683
4684
evaluate_training_data1.pl
4685
evaluate_training_data2.pl
4686
4687
that illustrate how the Perl class C can be used to evaluate the
4688
quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4689
class is a subclass of the C class in the module file. See the README
4690
in the C directory for further information regarding these two scripts.
4691
4692
The C directory of Versions 2.31 and higher of the module contains the
4693
following three scripts:
4694
4695
introspection_in_a_loop_interactive.pl
4696
4697
introspection_show_training_samples_at_all_nodes_direct_influence.pl
4698
4699
introspection_show_training_samples_to_nodes_influence_propagation.pl
4700
4701
The first script illustrates how to use the C class of the module
4702
interactively for generating explanations for the classification decisions made at
4703
the nodes of the decision tree. In the interactive session you are first asked for
4704
the node number you are interested in. Subsequently, you are asked for whether or
4705
not you are interested in specific questions that the introspector can provide
4706
answers for. The second script generates a tabular display that shows for each node
4707
of the decision tree a list of the training samples that fall directly in the portion
4708
of the feature space assigned that node. (As mentioned elsewhere in this
4709
documentation, when this list is empty for a node, that means the node is a result of
4710
the generalization achieved by probabilistic modeling of the data. Note that this
4711
module constructs a decision tree NOT by partitioning the set of training samples,
4712
BUT by partitioning the domains of the probability density functions.) The third
4713
script listed above also generates a tabular display, but one that shows how the
4714
influence of each training sample propagates in the tree. This display first shows
4715
the list of nodes that are affected directly by the data in a training sample. This
4716
list is followed by an indented display of the nodes that are affected indirectly by
4717
the training sample. A training sample affects a node indirectly if the node is a
4718
descendant of one of the nodes affected directly.
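The indirect-influence rule just stated is easy to make concrete: a sample that
directly affects a node also affects every descendant of that node. The snippet
below is a generic illustration of that closure computation over a made-up
parent-to-children hash; it is not the module's internal node representation:

    # Generic illustration -- NOT the module's internal data structures.
    my %children = ( 0 => [1, 2], 1 => [3, 4], 2 => [], 3 => [], 4 => [] );
    my @directly_affected = (1);     # nodes hit directly by one training sample

    my %affected;                    # will hold direct + indirect influence
    my @queue = @directly_affected;
    while (@queue) {
        my $node = shift @queue;
        next if $affected{$node}++;                 # skip nodes already visited
        push @queue, @{ $children{$node} || [] };   # descend into the subtree
    }
    print "sample influences nodes: @{[ sort { $a <=> $b } keys %affected ]}\n";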
4719
4720
The latest addition to the Examples directory is the script:
4721
4722
get_indexes_associated_with_fields.py
4723
4724
As to why you may find this script useful, note that large database files may have
4725
hundreds of fields and it is not always easy to figure out what numerical index is
4726
associated with a given field. At the same time, the constructor of the DecisionTree
4727
module requires that the field that holds the class label and the fields that contain
4728
the feature values be specified by their numerical zero-based indexes. If you have a
4729
large database and you are faced with this problem, you can run this script to see
4730
the zero-based numerical index values associated with the different columns of your
4731
CSV file.
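The shipped script is the one to run; but just to make the idea concrete, the
stand-alone fragment below (not the shipped script) prints the zero-based index of
every column named in a CSV header line, assuming a simple header with no quoted or
embedded commas; the file name is hypothetical:

    # Stand-alone illustration only, not the script shipped with the module.
    my $csv_file = 'my_database.csv';                        # hypothetical
    open my $fh, '<', $csv_file or die "Cannot open $csv_file: $!";
    my $header = <$fh>;
    close $fh;
    chomp $header;
    my @fields = split /,/, $header;                         # naive split
    printf "%4d  %s\n", $_, $fields[$_] for 0 .. $#fields;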
4732
4733
4734
=head1 THE C<ExamplesBagging> DIRECTORY
4735
4736
The C<ExamplesBagging> directory contains the following scripts:
4737
4738
bagging_for_classifying_one_test_sample.pl
4739
4740
bagging_for_bulk_classification.pl
4741
4742
As the names of the scripts imply, the first shows how to call the different methods
4743
of the C<DecisionTreeWithBagging> class for classifying a single test sample. When
4744
you are classifying a single test sample, you can also see how each bag is
4745
classifying the test sample. You can, for example, display the training data used in
4746
each bag, the decision tree constructed for each bag, etc.
4747
4748
The second script is for the case when you place all of the test samples in a single
4749
file. The demonstration script displays for each test sample a single aggregate
4750
classification decision that is obtained through majority voting by all the decision
4751
trees.
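The two scripts are the place to learn the per-bag API. The aggregation rule itself
is ordinary majority voting, which the generic snippet below illustrates with
made-up labels; it is not the module's own aggregation code:

    # Plain majority voting over per-bag decisions (made-up labels).
    my @per_bag_labels = qw(benign malignant benign benign malignant);
    my %votes;
    $votes{$_}++ for @per_bag_labels;
    my ($winner) = sort { $votes{$b} <=> $votes{$a} || $a cmp $b } keys %votes;
    print "majority-vote classification: $winner\n";    # prints 'benign'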
4752
4753
4754
=head1 THE C<ExamplesBoosting> DIRECTORY
4755
4756
The C<ExamplesBoosting> subdirectory in the main installation directory contains the
4757
following three scripts:
4758
4759
boosting_for_classifying_one_test_sample_1.pl
4760
4761
boosting_for_classifying_one_test_sample_2.pl
4762
4763
boosting_for_bulk_classification.pl
4764
4765
As the names of the first two scripts imply, these show how to call the different
4766
methods of the C<BoostedDecisionTree> class for classifying a single test sample.
4767
When you are classifying a single test sample, you can see how each stage of the
4768
cascade of decision trees is classifying the test sample. You can also view each
4769
decision tree separately and see the trust factor associated with each tree.
4770
4771
The third script is for the case when you place all of the test samples in a single
4772
file. The demonstration script outputs for each test sample a single aggregate
4773
classification decision that is obtained through trust-factor weighted majority
4774
voting by all the decision trees.
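Here, too, the scripts are the reference for the class's API. The final aggregation
rule, however, is simply a trust-factor weighted vote, sketched generically below
with made-up labels and trust factors; this is not the module's own code:

    # Trust-factor weighted voting (made-up labels and trust factors).
    my @stage_decisions = (
        { label => 'benign',    trust => 1.30 },
        { label => 'malignant', trust => 0.40 },
        { label => 'benign',    trust => 0.85 },
    );
    my %weighted;
    $weighted{ $_->{label} } += $_->{trust} for @stage_decisions;
    my ($winner) = sort { $weighted{$b} <=> $weighted{$a} } keys %weighted;
    printf "weighted-vote classification: %s (total trust %.2f)\n",
           $winner, $weighted{$winner};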
4775
4776
=head1 THE C<ExamplesRandomizedTrees> DIRECTORY
4777
4778
The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
4779
become more familiar with the C<RandomizedTreesForBigData> class for solving
4780
needle-in-a-haystack and big-data classification problems. These scripts are:
4781
4782
randomized_trees_for_classifying_one_test_sample_1.pl
4783
4784
randomized_trees_for_classifying_one_test_sample_2.pl
4785
4786
classify_database_records.pl
4787
4788
The first script shows the constructor options to use for solving a
4789
needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
4790
training data belongs to just one class. The second script shows the constructor
4791
options for using randomized decision trees for the case when you have access to a
4792
very large database of training samples and you'd like to construct an ensemble of
4793
decision trees using training samples pulled randomly from the training database.
4794
The last script illustrates how you can evaluate the classification power of an
4795
ensemble of decision trees as constructed by C<RandomizedTreesForBigData> by classifying
4796
a large number of test samples extracted randomly from the training database.
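The constructor options to use are shown in the scripts themselves. Purely as an
illustration of the underlying idea (each tree in the ensemble is trained on a
different random subset of a large pool of training records), here is a generic
snippet with made-up numbers, unrelated to the module's internals:

    use List::Util qw(shuffle);

    # Generic per-tree random subsampling (made-up sizes).
    my @all_record_ids   = (1 .. 10_000);   # stand-ins for database records
    my $how_many_trees   = 5;
    my $samples_per_tree = 200;
    for my $tree (1 .. $how_many_trees) {
        my @subset = (shuffle @all_record_ids)[0 .. $samples_per_tree - 1];
        print "tree $tree trains on ", scalar(@subset), " randomly drawn records\n";
    }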
4797
4798
4799
=head1 THE C<ExamplesRegression> DIRECTORY
4800
4801
The C<ExamplesRegression> subdirectory in the main installation directory shows
4802
example scripts that you can use to become familiar with regression trees and how
4803
they can be used for nonlinear regression. If you are new to the concept of
4804
regression trees, start by executing the following scripts without changing them and
4805
see what sort of output is produced by them:
4806
4807
regression4.pl
4808
4809
regression5.pl
4810
4811
regression6.pl
4812
4813
regression8.pl
4814
4815
The C<regression4.pl> script involves only one predictor variable and one dependent
4816
variable. The training data for this exercise is drawn from a CSV data file supplied in the same directory.
4817
This data file contains strongly nonlinear data. When you run the script
4818
C<regression4.pl>, you will see how much better the result from tree regression is
4819
compared to what you can get with linear regression.
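If you are curious what the linear-regression baseline in that comparison amounts to
for a single predictor variable, it is just an ordinary least-squares line fit, as in
the self-contained generic snippet below (the data points are made up and the code is
not taken from the module):

    # Ordinary least-squares fit of y = a + b*x for one predictor (made-up data).
    my @x = (1, 2, 3, 4, 5);
    my @y = (1.1, 1.9, 4.2, 8.1, 15.8);    # strongly nonlinear in x
    my $n = @x;
    my ($sx, $sy, $sxx, $sxy) = (0, 0, 0, 0);
    for my $i (0 .. $n - 1) {
        $sx  += $x[$i];
        $sy  += $y[$i];
        $sxx += $x[$i] ** 2;
        $sxy += $x[$i] * $y[$i];
    }
    my $b = ($n * $sxy - $sx * $sy) / ($n * $sxx - $sx ** 2);
    my $a = ($sy - $b * $sx) / $n;
    printf "linear fit: y = %.3f + %.3f * x\n", $a, $b;

A single straight line like this cannot follow the curvature in strongly nonlinear
data, which is why the piecewise fit produced by a regression tree does so much
better on the data used by C<regression4.pl>.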
4820
4821
The C<regression5.pl> script is essentially the same as the previous script except
4822
for the fact that the training datafile used in this case consists
4823
of three noisy segments, as opposed to just two in the previous case.
4824
4825
The script C<regression6.pl> deals with the case when we have two predictor variables
4826
and one dependent variable. You can think of the data as consisting of noisy height
4827
values over an C<(x1,x2)> plane. The data used in this script is drawn from the csv
4828
file supplied in the same directory.
4829
4830
Finally, the script C<regression8.pl> shows how you can carry out bulk prediction for
4831
all your test data records in a disk file. The script writes all the calculated
4832
predictions into another disk file whose name is derived from the name of the test
4833
data file.
4834
4835
4836
=head1 EXPORT
4837
4838
None by design.
4839
4840
=head1 BUGS
4841
4842
Please notify the author if you encounter any bugs. When sending email, please place
4843
the string 'DecisionTree' in the subject line.
4844
4845
=head1 INSTALLATION
4846
4847
Download the archive from CPAN in any directory of your choice. Unpack the archive
4848
with a command that on a Linux machine would look like:
4849
4850
tar zxvf Algorithm-DecisionTree-3.42.tar.gz
4851
4852
This will create an installation directory for you whose name will be
4853
C<Algorithm-DecisionTree-3.42>. Enter this directory and execute the following
4854
commands for a standard install of the module if you have root privileges:
4855
4856
perl Makefile.PL
4857
make
4858
make test
4859
sudo make install
4860
4861
If you do not have root privileges, you can carry out a non-standard install of the
4862
module in any directory of your choice by:
4863
4864
perl Makefile.PL prefix=/some/other/directory/
4865
make
4866
make test
4867
make install
4868
4869
With a non-standard install, you may also have to set your PERL5LIB environment
4870
variable so that this module can find the other modules it requires. How you do that
4871
would depend on what platform you are working on. In order to install this module on
4872
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
4873
variable by
4874
4875
setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4876
4877
If I used bash, I'd need to declare:
4878
4879
export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4880
4881
4882
=head1 THANKS
4883
4884
I wish to thank many users of this module for their feedback. Many of the
4885
improvements I have made to the module over the years are a result of the feedback
4886
received.
4887
4888
I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
4889
Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
4890
used for testing the module. Slaven's feedback in the form of the Bug report
4891
C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
4892
the required version of Perl to 5.10.
4893
4894
On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version 2.27,
4895
I am removing the Perl version restriction altogether from Version 2.30. Thanks
4896
Slaven!
4897
4898
4899
=head1 AUTHOR
4900
4901
The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
4902
with the publication of the book I<Designing with Objects> by John-Wiley. If
4903
interested, visit his web page at Purdue to find out what this project was all
4904
about. You might like I<Designing with Objects> especially if you enjoyed reading
4905
Harry Potter as a kid (or even as an adult, for that matter).
4906
4907
If you send email regarding this module, please place the string "DecisionTree" in
4908
your subject line to get past my spam filter. Avi Kak's email address is
4909
C<kak@purdue.edu>
4910
4911
=head1 COPYRIGHT
4912
4913
This library is free software; you can redistribute it and/or modify it under the
4914
same terms as Perl itself.
4915
4916
Copyright 2016 Avinash Kak
4917
4918
=cut
4919