package Algorithm::DecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2016 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
# multidimensional data.
# -------------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;

our $VERSION = '3.41';

############################################ Constructor ##############################################
sub new {
    my ($class, %args, $eval_or_boosting_mode);
    if (@_ % 2 != 0) {
        ($class, %args) = @_;
    } else {
        $class = shift;
        $eval_or_boosting_mode = shift;
        die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
        die "Only one string arg allowed in eval and boosting modes" if @_;
    }
    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
            if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                          =>  $args{training_datafile},
        _entropy_threshold                          =>  $args{entropy_threshold} || 0.01,
        _max_depth_desired                          =>  exists $args{max_depth_desired} ?
                                                            $args{max_depth_desired} : undef,
        _debug1                                     =>  $args{debug1} || 0,
        _debug2                                     =>  $args{debug2} || 0,
        _debug3                                     =>  $args{debug3} || 0,
        _csv_class_column_index                     =>  $args{csv_class_column_index} || undef,
        _csv_columns_for_features                   =>  $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold  =>  $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins                   =>  $args{number_of_histogram_bins} || undef,
        _training_data                              =>  [],
        _root_node                                  =>  undef,
        _probability_cache                          =>  {},
        _entropy_cache                              =>  {},
        _training_data_hash                         =>  {},
        _features_and_values_hash                   =>  {},
        _samples_class_label_hash                   =>  {},
        _class_names                                =>  [],
        _class_priors                               =>  [],
        _class_priors_hash                          =>  {},
        _feature_names                              =>  [],
        _numeric_features_valuerange_hash           =>  {},
        _sampling_points_for_numeric_feature_hash   =>  {},
        _feature_values_how_many_uniques_hash       =>  {},
        _prob_distribution_numeric_features_hash    =>  {},
        _histogram_delta_hash                       =>  {},
        _num_of_histogram_bins_hash                 =>  {},
    }, $class;
}
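
##  A minimal usage sketch for the constructor (the file name, column indices, and
##  most keyword values below are illustrative placeholders, not requirements of the
##  module):
##
##      use Algorithm::DecisionTree;
##      my $dt = Algorithm::DecisionTree->new(
##                    training_datafile                          => 'training_data.csv',
##                    csv_class_column_index                     => 1,
##                    csv_columns_for_features                   => [2,3,4,5],
##                    entropy_threshold                          => 0.01,
##                    max_depth_desired                          => 8,
##                    symbolic_to_numeric_cardinality_threshold  => 10,
##               );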

#################################### Classify with Decision Tree #######################################

## Classifies one test sample at a time using the decision tree constructed from
## your training file. The data record for the test sample must be supplied as
## shown in the scripts in the `examples' subdirectory. See the scripts
## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
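##
##  Usage sketch (the feature names and values are illustrative only; $root_node is
##  the node returned by construct_decision_tree_classifier() shown later in this
##  file, and the test sample is an array of 'feature=value' strings):
##
##      my @test_sample = ('feature1=3.4', 'feature2=high', 'feature3=0.7');
##      my $classification = $dt->classify($root_node, \@test_sample);
##      foreach my $class (sort keys %$classification) {
##          next if $class eq 'solution_path';
##          print "$class  =>  $classification->{$class}\n";
##      }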
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "Error in the names you have used for features and/or values"
        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
            $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node,
                                                        \@features_and_values, \%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} = \@outlist;
        }
    }
    return \%classification_for_display;
}

sub recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $features_and_values = shift;
    my $answer = shift;
    my @features_and_values = @$features_and_values;
    my %answer = %$answer;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;
    }
    # The following clause introduced in Version 3.20
    if (!defined $value_for_feature) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    if ($value_for_feature) {
        if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
            print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
            my $pattern1 = '(.+)<(.+)';
            my $pattern2 = '(.+)>(.+)';
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
                if ($last_feature_and_value_on_branch =~ /$pattern1/) {
                    my ($feature, $threshold) = ($1,$2);
                    if ($value_for_feature <= $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                            $features_and_values, \%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
                if ($last_feature_and_value_on_branch =~ /$pattern2/) {
                    my ($feature, $threshold) = ($1,$2);
                    if ($value_for_feature > $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                            $features_and_values, \%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
            }
            return \%answer if $path_found;
        } else {
            my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
            print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
                if $self->{_debug3};
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
                if ($last_feature_and_value_on_branch eq $feature_value_combo) {
                    %answer = %{$self->recursive_descent_for_classification($child,
                                                        $features_and_values, \%answer)};
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    $path_found = 1;
                    last;
                }
            }
            return \%answer if $path_found;
        }
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

## If you want classification to be carried out by engaging a human user in a
## question-answer session, this is the method to use for that purpose. See, for
## example, the script classify_by_asking_questions.pl in the `examples'
## subdirectory for an illustration of how to do that.
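##
##  Usage sketch (assumes the decision tree has already been built and its root node
##  is in $root_node; the method prompts on standard input for one feature value at a
##  time and returns the same kind of hash as classify()):
##
##      my $classification = $dt->classify_by_asking_questions($root_node);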
sub classify_by_asking_questions {
    my $self = shift;
    my $root_node = shift;
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %scratchpad_for_numeric_answers = ();
    foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
        $scratchpad_for_numeric_answers{$feature_name} = undef;
    }
    my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
                                               \%answer, \%scratchpad_for_numeric_answers)};
    @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} = \@outlist;
        }
    }
    return \%classification_for_display;
}

sub interactive_recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $answer = shift;
    my $scratchpad_for_numerics = shift;
    my %answer = %$answer;
    my %scratchpad_for_numerics = %$scratchpad_for_numerics;
    my $pattern1 = '(.+)<(.+)';
    my $pattern2 = '(.+)>(.+)';
    my $user_value_for_feature;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my @list_of_branch_attributes_to_children = ();
    foreach my $child (@children) {
        my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
        my $feature_and_value_on_branch = $branch_features_and_values[-1];
        push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
    }
    my $feature_tested_at_node = $node->get_feature();
    my $feature_value_combo;
    my $path_found = 0;
    if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
        if ($scratchpad_for_numerics{$feature_tested_at_node}) {
            $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
        } else {
            my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
            while (1) {
                print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
                print "\nEnter a value in the range (@valuerange): ";
                $user_value_for_feature = <STDIN>;     # read the user's answer from standard input
                $user_value_for_feature =~ s/\r?\n?$//;
                $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
                my $answer_found = 0;
                if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
                    $answer_found = 1;
                    last;
                }
                last if $answer_found;
                print("You entered illegal value. Let's try again");
            }
            $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
        }
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute =~ /$pattern1/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature <= $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                        \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
            if ($branch_attribute =~ /$pattern2/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature > $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                        \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
        }
        return \%answer if $path_found;
    } else {
        my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
        while (1) {
            print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
            print "\nEnter a value from the list (@possible_values_for_feature): ";
            $user_value_for_feature = <STDIN>;     # read the user's answer from standard input
            $user_value_for_feature =~ s/\r?\n?$//;
            $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
            my $answer_found = 0;
            if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
                $answer_found = 1;
                last;
            }
            last if $answer_found;
            print("You entered illegal value. Let's try again");
        }
        $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute eq $feature_value_combo) {
                %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                    \%answer, \%scratchpad_for_numerics)};
                $path_found = 1;
                push @{$answer{'solution_path'}}, $node->get_serial_num();
                last;
            }
        }
        return \%answer if $path_found;
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

###################################### Decision Tree Construction ####################################

## At the root node, we find the best feature that yields the greatest reduction in
## class entropy from the entropy based on just the class priors. The logic for
## finding this feature is different for symbolic features and for numeric features.
## That logic is built into the method shown later for best feature calculations.
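##
##  A typical calling sequence, sketched after the example scripts (the training-data
##  ingestion methods named here live elsewhere in this module and are not shown in
##  this section):
##
##      $dt->get_training_data();
##      $dt->calculate_first_order_probabilities();
##      $dt->calculate_class_priors();
##      my $root_node = $dt->construct_decision_tree_classifier();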
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {
        $self->determine_data_condition();
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) {
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}

## After the root node of the decision tree is calculated by the previous methods,
## we invoke this method recursively to create the rest of the tree. At each node,
## we find the feature that achieves the largest entropy reduction with regard to
## the partitioning of the training data samples that correspond to that node.
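##
##  For orientation, the node-splitting test used in this method boils down to: a
##  child node is created only when (parent entropy - child entropy) exceeds the
##  entropy_threshold constructor parameter. With the default threshold of 0.01, a
##  node with entropy 0.97 and a candidate child with entropy 0.88 gives a gain of
##  0.09 and is therefore expanded; recursion also stops once the branch length
##  reaches max_depth_desired.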
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
        if $self->{_debug3};
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my $existing_node_entropy = $node->get_node_entropy();
    if ($self->{_debug3}) {
        print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) {
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
        $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) &&
        (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) {
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
            $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;        # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
                @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
                @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
                $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
                $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                    "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                    "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node =
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                    \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node =
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                    \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                    \@class_probabilities_for_lessthan_child_node,
                    \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                    \@class_probabilities_for_greaterthan_child_node,
                    \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds =
                        ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds =
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds,
                        $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                    map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                        \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child =
                    $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                        \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n";
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                        \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
            if $self->{_debug3};
        return;
    }
}

## This is the heart of the decision tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples that
## correspond to the current node. The search for the best feature is carried out
## differently for symbolic features and for numeric features. For a symbolic
## feature, the method estimates the entropy for each value of the feature and then
## averages out these entropies as a measure of the discriminatory power of that
## feature. For a numeric feature, on the other hand, it estimates the entropy
## reduction that can be achieved if we were to partition the set of training samples
## at each possible threshold. For a numeric feature, all possible sampling points
## relevant to the node in question are considered as candidates for thresholds.
543
29
29
0
24
my $self = shift;
544
29
27
my $features_and_values_or_thresholds_on_branch = shift;
545
29
16
my $existing_node_entropy = shift;
546
29
29
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
547
29
23
my $pattern1 = '(.+)=(.+)';
548
29
18
my $pattern2 = '(.+)<(.+)';
549
29
17
my $pattern3 = '(.+)>(.+)';
550
29
34
my @all_symbolic_features = ();
551
29
19
foreach my $feature_name (@{$self->{_feature_names}}) {
29
38
552
push @all_symbolic_features, $feature_name
553
116
50
171
if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
554
}
555
29
25
my @symbolic_features_already_used = ();
556
29
27
foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
557
81
50
309
push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
558
}
559
29
17
my @symbolic_features_not_yet_used;
560
29
71
foreach my $x (@all_symbolic_features) {
561
116
100
112
push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
562
}
563
29
26
my @true_numeric_types = ();
564
29
23
my @symbolic_types = ();
565
29
18
my @true_numeric_types_feature_names = ();
566
29
32
my @symbolic_types_feature_names = ();
567
29
26
foreach my $item (@features_and_values_or_thresholds_on_branch) {
568
81
50
250
if ($item =~ /$pattern2/) {
50
50
569
0
0
push @true_numeric_types, $item;
570
0
0
push @true_numeric_types_feature_names, $1;
571
} elsif ($item =~ /$pattern3/) {
572
0
0
push @true_numeric_types, $item;
573
0
0
push @true_numeric_types_feature_names, $1;
574
} elsif ($item =~ /$pattern1/) {
575
81
64
push @symbolic_types, $item;
576
81
103
push @symbolic_types_feature_names, $1;
577
} else {
578
0
0
die "format error in the representation of feature and values or thresholds";
579
}
580
}
581
29
34
my %seen = ();
582
29
0
27
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
0
0
583
29
15
%seen = ();
584
29
50
26
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
81
191
585
my @bounded_intervals_numeric_types =
586
29
23
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
29
41
587
# Calculate the upper and the lower bounds to be used when searching for the best
588
# threshold for each of the numeric features that are in play at the current node:
589
29
25
my (%upperbound, %lowerbound);
590
29
26
foreach my $feature (@true_numeric_types_feature_names) {
591
0
0
$upperbound{$feature} = undef;
592
0
0
$lowerbound{$feature} = undef;
593
}
594
29
27
foreach my $item (@bounded_intervals_numeric_types) {
595
0
0
foreach my $feature_grouping (@$item) {
596
0
0
0
if ($feature_grouping->[1] eq '>') {
597
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
598
} else {
599
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
600
}
601
}
602
}
603
29
30
my %entropy_values_for_different_features = ();
604
29
17
my %partitioning_point_child_entropies_hash = ();
605
29
22
my %partitioning_point_threshold = ();
606
29
25
my %entropies_for_different_values_of_symbolic_feature = ();
607
29
34
foreach my $feature (@{$self->{_feature_names}}) {
29
37
608
116
113
$entropy_values_for_different_features{$feature} = [];
609
116
90
$partitioning_point_child_entropies_hash{$feature} = {};
610
116
89
$partitioning_point_threshold{$feature} = undef;
611
116
126
$entropies_for_different_values_of_symbolic_feature{$feature} = [];
612
}
613
29
22
foreach my $i (0..@{$self->{_feature_names}}-1) {
29
49
614
116
95
my $feature_name = $self->{_feature_names}->[$i];
615
116
50
139
print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
616
116
100
33
116
if (contained_in($feature_name, @symbolic_features_already_used)) {
50
617
81
87
next;
618
35
58
} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
619
$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
620
$self->{_symbolic_to_numeric_cardinality_threshold}) {
621
0
0
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
622
0
0
0
print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
623
0
0
my @newvalues = ();
624
0
0
0
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
625
0
0
0
0
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
0
0
0
0
0
0
626
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
627
0
0
next;
628
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
629
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
630
0
0
foreach my $x (@values) {
631
0
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
632
}
633
} elsif (defined($upperbound{$feature_name})) {
634
0
0
foreach my $x (@values) {
635
0
0
0
push @newvalues, $x if $x <= $upperbound{$feature_name};
636
}
637
} elsif (defined($lowerbound{$feature_name})) {
638
0
0
foreach my $x (@values) {
639
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name};
640
}
641
} else {
642
0
0
die "Error is bound specifications in best feature calculator";
643
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues == 0;
            my @partitioning_entropies = ();
            foreach my $value (@newvalues) {
                my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $partitioning_entropy = $entropy1 *
                    $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
                    $entropy2 *
                    $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++}
                @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
            my $entropy = 0;
            foreach my $value (@values) {
                my $feature_value_string = "$feature_name" . '=' . "$value";
                print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
                my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                if (@features_and_values_or_thresholds_on_branch) {
                    push @extended_attributes, $feature_value_string;
                } else {
                    @extended_attributes = ($feature_value_string);
                }
                $entropy +=
                    $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
                    $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                    "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) {
        if (!defined($best_feature_name)) {
            $best_feature_name = $feature_nom;
            $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
        } else {
            if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
                $best_feature_name = $feature_nom;
                $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
            }
        }
    }
    my $threshold_for_best_feature;
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
    } else {
        $threshold_for_best_feature = undef;
    }
    my $best_feature_entropy = $min_entropy_for_best_feature;
    my @val_based_entropies_to_be_returned;
    my $decision_val_to_be_returned;
    if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
        $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
        $self->{_symbolic_to_numeric_cardinality_threshold}) {
        @val_based_entropies_to_be_returned =
            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
    } else {
        $decision_val_to_be_returned = undef;
    }
    print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
        "@val_based_entropies_to_be_returned\n" if $self->{_debug3};
    return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
            $decision_val_to_be_returned);
}

######################################### Entropy Calculators #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'}
        if exists $self->{_entropy_cache}->{"priors"};
    my @class_names = @{$self->{_class_names}};
    my $entropy;
    foreach my $class (@class_names) {
        my $prob = $self->prior_probability_for_class($class);
        my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999);
        $log_prob = 0 if $prob < 0.0001;           # since X.log(X)->0 as X->0
        $log_prob = 0 if $prob > 0.999;            # since log(1) = 0
        if (!defined $entropy) {
            $entropy = -1.0 * $prob * $log_prob;
            next;
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    $self->{_entropy_cache}->{'priors'} = $entropy;
    return $entropy;
}
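
##  As a quick check of the formula implemented above, entropy = -sum_i p_i * log2(p_i):
##  two classes with priors 0.5 and 0.5 give 1 bit, while priors 0.9 and 0.1 give about
##  0.469 bits. A self-contained illustration (not part of the module's API):
##
##      my @priors = (0.9, 0.1);
##      my $H = 0;
##      $H += -$_ * log($_) / log(2) for grep {$_ > 0} @priors;   # $H is ~0.469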

sub entropy_scanner_for_a_numeric_feature {
    local $| = 1;
    my $self = shift;
    my $feature = shift;
    my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
    my @entropies_for_less_than_thresholds = ();
    my @entropies_for_greater_than_thresholds = ();
    foreach my $point (@all_sampling_points) {
        print ". ";
        push @entropies_for_less_than_thresholds,
            $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
        push @entropies_for_greater_than_thresholds,
            $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
    }
    print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: " .
        "@entropies_for_less_than_thresholds\n";
    print "\nSCANNER: All entropies greater than thresholds for feature $feature are: " .
        "@entropies_for_greater_than_thresholds\n";
}

sub class_entropy_for_less_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '<' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds =
        @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
            $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}

sub class_entropy_for_greater_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '>' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds =
        @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
            $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}

sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $array_of_features_and_values_or_thresholds = shift;
    my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
            $class_name, \@array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}

##################################### Probability Calculators ########################################

sub prior_probability_for_class {
    my $self = shift;
    my $class = shift;
    my $class_name_in_cache = "prior" . '::' . $class;
    return $self->{_probability_cache}->{$class_name_in_cache}
        if exists $self->{_probability_cache}->{$class_name_in_cache};
    my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
    my @values = values %{$self->{_samples_class_label_hash}};
    foreach my $class_name (@{$self->{_class_names}}) {
        my @trues = grep {$_ eq $class_name} @values;
        my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
        my $this_class_name_in_cache = "prior" . '::' . $class_name;
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    return $self->{_probability_cache}->{$class_name_in_cache};
}

sub calculate_class_priors {
    my $self = shift;
    return if scalar keys %{$self->{_class_priors_hash}} > 1;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $class_name_in_cache = "prior::$class_name";
        my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
        my @all_values = values %{$self->{_samples_class_label_hash}};
        my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
        my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
        $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
        my $this_class_name_in_cache = "prior::$class_name";
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    if ($self->{_debug1}) {
        foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
            print "$class => $self->{_class_priors_hash}->{$class}\n";
        }
    }
}
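
##  For example, if the training file held 100 samples of which 62 carried the class
##  label 'benign' and 38 the label 'malignant' (labels chosen purely for
##  illustration), the loop above would cache the priors 0.62 and 0.38 under the
##  keys 'prior::benign' and 'prior::malignant' respectively.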

sub calculate_first_order_probabilities {
    print "\nEstimating probabilities...\n";
    my $self = shift;
    foreach my $feature (@{$self->{_feature_names}}) {
        $self->probability_of_feature_value($feature, undef);
        if ($self->{_debug2}) {
            if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
                print "\nPresenting probability distribution for a numeric feature:\n";
                foreach my $sampling_point (sort {$a <=> $b} keys
                            %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
                    my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
                    print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
                        $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
                }
            } else {
                print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
                my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
                foreach my $value (sort @values_for_feature) {
                    my $prob = $self->probability_of_feature_value($feature,$value);
                    print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
                }
            }
        }
    }
}

sub probability_of_feature_value {
    my $self = shift;
    my $feature_name = shift;
    my $value = shift;
    $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
    if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $value = closest_sampling_point($value,
                     $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_and_value;
    if (defined($value)) {
        $feature_and_value = "$feature_name=$value";
    }
    if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
        return $self->{_probability_cache}->{$feature_and_value};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
                @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
                $diffrange = $valuerange[1] - $valuerange[0];
                my %seen = ();
                my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                    @{$self->{_features_and_values_hash}->{$feature_name}};
                my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
                    $unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
                my $median_diff = $diffs[int(@diffs/2) - 1];
                $histogram_delta = $median_diff * 2;
                if ($histogram_delta < $diffrange / 500.0) {
                    if (defined $self->{_number_of_histogram_bins}) {
                        $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
                    } else {
                        $histogram_delta = $diffrange / 500.0;
                    }
                }
                $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
                $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
                $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
                my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
                    0..$num_of_histogram_bins-1;
                $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
                    \@sampling_points_for_feature;
            }
        }
    }
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @sampling_points_for_feature =
                @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_values_for_feature = grep {$_ ne 'NA'}
                @{$self->{_features_and_values_hash}->{$feature_name}};
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_values_for_feature-1) {
                    if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my %bin_prob_hash = ();
            foreach my $i (0..@sampling_points_for_feature-1) {
                $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
            }
            $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
            my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
                @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        } else {
            my %seen = ();
            my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                @{$self->{_features_and_values_hash}->{$feature_name}};
            @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
            my @value_counts = (0) x @values_for_feature;
            # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
            foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
                my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
                foreach my $i (0..@values_for_feature-1) {
                    foreach my $current_value (@features_and_values) {
                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFV Something is wrong with your training file. It contains no training samples \
                 for feature named $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        }
    } else {
        # This section is only for purely symbolic features:
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
        # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                for my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        foreach my $i (0..@values_for_feature-1) {
            $self->{_probability_cache}->{$values_for_feature[$i]} =
                $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
        }
        if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
            return $self->{_probability_cache}->{$feature_and_value};
        } else {
            return 0;
        }
    }
}
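
##  Note on the numeric branch above: it effectively histograms the feature. The bin
##  width is twice the median gap between consecutive unique values, but never
##  narrower than 1/500th of the value range unless number_of_histogram_bins says
##  otherwise, and the per-bin counts are then normalized into the cached
##  probabilities. For instance (illustrative numbers only), a feature spanning
##  [0, 50] with a median gap of 0.4 gets a bin width of 0.8 and hence
##  int(50/0.8)+1 = 63 sampling points.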

sub probability_of_feature_value_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $feature_value = shift;
    my $class_name = shift;
    $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
    if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $feature_value = closest_sampling_point($feature_value,
                             $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_value_class;
    if (defined($feature_value)) {
        $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
    }
    if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
        print "\nNext answer returned by cache for feature $feature_name and " .
            "value $feature_value given class $class_name\n" if $self->{_debug2};
        return $self->{_probability_cache}->{$feature_value_class};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);

    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
            $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
            @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
            $diffrange = $valuerange[1] - $valuerange[0];
        }
    }
    my @samples_for_class = ();
# Accumulate all sample names for the given class:
1129
8
3
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
8
34
1130
280
100
320
if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1131
140
116
push @samples_for_class, $sample_name;
1132
}
1133
}
1134
8
50
18
if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1135
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1136
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1137
my @sampling_points_for_feature =
1138
0
0
@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
1139
0
0
my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1140
0
0
my @actual_feature_values_for_samples_in_class = ();
1141
0
0
foreach my $sample (@samples_for_class) {
1142
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1143
0
0
my $pattern = '(.+)=(.+)';
1144
0
0
$feature_and_value =~ /$pattern/;
1145
0
0
my ($feature, $value) = ($1, $2);
1146
0
0
0
0
if (($feature eq $feature_name) && ($value ne 'NA')) {
1147
0
0
push @actual_feature_values_for_samples_in_class, $value;
1148
}
1149
}
1150
}
1151
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1152
0
0
foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1153
0
0
0
if (abs($sampling_points_for_feature[$i] -
1154
$actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1155
0
0
$counts_at_sampling_points[$i]++;
1156
}
1157
}
1158
}
1159
0
0
my $total_counts = 0;
1160
0
0
map {$total_counts += $_} @counts_at_sampling_points;
0
0
1161
0
0
0
die "PFVC1 Something is wrong with your training file. It contains no training " .
1162
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1163
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
0
0
1164
0
0
my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
0
0
1165
@sampling_points_for_feature;
1166
0
0
foreach my $i (0..@values_for_feature_and_class-1) {
1167
0
0
$self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1168
}
1169
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1170
0
0
return $self->{_probability_cache}->{$feature_value_class};
1171
} else {
1172
0
0
return 0;
1173
}
1174
} else {
1175
# This section is for numeric features that will be treated symbolically
1176
0
0
my %seen = ();
1177
0
0
0
0
my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1178
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1179
0
0
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
0
0
1180
0
0
my @value_counts = (0) x @values_for_feature;
1181
0
0
foreach my $sample (@samples_for_class) {
1182
0
0
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
0
0
1183
0
0
foreach my $i (0..@values_for_feature-1) {
1184
0
0
foreach my $current_value (@features_and_values) {
1185
0
0
0
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1186
}
1187
}
1188
}
1189
0
0
my $total_counts = 0;
1190
0
0
map {$total_counts += $_} @value_counts;
0
0
1191
0
0
0
die "PFVC2 Something is wrong with your training file. It contains no training " .
1192
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1193
# We normalize by total_count because the probabilities are conditioned on a given class
1194
0
0
foreach my $i (0..@values_for_feature-1) {
1195
0
0
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1196
0
0
$self->{_probability_cache}->{$feature_and_value_and_class} =
1197
$value_counts[$i] / (1.0 * $total_counts);
1198
}
1199
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1200
0
0
return $self->{_probability_cache}->{$feature_value_class};
1201
} else {
1202
0
0
return 0;
1203
}
1204
}
1205
} else {
1206
# This section is for purely symbolic features
1207
8
5
my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
8
36
1208
8
8
my %seen = ();
1209
280
100
66
729
@values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1210
8
6
@{$self->{_features_and_values_hash}->{$feature_name}};
8
11
1211
8
9
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
28
40
1212
8
12
my @value_counts = (0) x @values_for_feature;
1213
8
8
foreach my $sample (@samples_for_class) {
1214
140
73
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
140
169
1215
140
123
foreach my $i (0..@values_for_feature-1) {
1216
490
327
foreach my $current_value (@features_and_values) {
1217
1960
100
2381
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1218
}
1219
}
1220
}
1221
8
9
my $total_counts = 0;
1222
8
6
map {$total_counts += $_} @value_counts;
28
23
1223
8
50
10
die "PFVC2 Something is wrong with your training file. It contains no training " .
1224
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1225
# We normalize by total_count because the probabilities are conditioned on a given class
1226
8
9
foreach my $i (0..@values_for_feature-1) {
1227
28
29
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1228
28
55
$self->{_probability_cache}->{$feature_and_value_and_class} =
1229
$value_counts[$i] / (1.0 * $total_counts);
1230
}
1231
8
50
13
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1232
8
32
return $self->{_probability_cache}->{$feature_value_class};
1233
} else {
1234
0
0
return 0;
1235
}
1236
}
1237
}
1238
1239
sub probability_of_feature_less_than_threshold {
1240
0
0
0
0
my $self = shift;
1241
0
0
my $feature_name = shift;
1242
0
0
my $threshold = shift;
1243
0
0
my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1244
return $self->{_probability_cache}->{$feature_threshold_combo}
1245
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1246
0
0
0
my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
0
0
0
0
1247
0
0
0
my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
0
0
1248
0
0
my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1249
0
0
$self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1250
0
0
return $probability;
1251
}
1252
1253
sub probability_of_feature_less_than_threshold_given_class {
1254
0
0
0
0
my $self = shift;
1255
0
0
my $feature_name = shift;
1256
0
0
my $threshold = shift;
1257
0
0
my $class_name = shift;
1258
0
0
my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1259
return $self->{_probability_cache}->{$feature_threshold_class_combo}
1260
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1261
0
0
my @data_samples_for_class = ();
1262
# Accumulate all sample names for the given class:
1263
0
0
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
0
0
1264
push @data_samples_for_class, $sample_name
1265
0
0
0
if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1266
}
1267
0
0
my @actual_feature_values_for_samples_in_class = ();
1268
0
0
foreach my $sample (@data_samples_for_class) {
1269
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1270
0
0
my $pattern = '(.+)=(.+)';
1271
0
0
$feature_and_value =~ /$pattern/;
1272
0
0
my ($feature,$value) = ($1,$2);
1273
0
0
0
0
push @actual_feature_values_for_samples_in_class, $value
1274
if $feature eq $feature_name && $value ne 'NA';
1275
}
1276
}
1277
0
0
0
my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
0
0
1278
# The condition in the assignment that follows was a bug correction in Version 3.20
1279
0
0
0
my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1280
0
0
$self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1281
0
0
return $probability;
1282
}
1283
1284
# This method requires that all truly numeric types only be expressed as '<' or '>'
1285
# constructs in the array of branch features and thresholds
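# A sketch of the expected argument form (the feature names here are hypothetical):
#     my @branch = ('smoking=never', 'age>36', 'g2<2.5');
#     my $p = $dt->probability_of_a_sequence_of_features_and_values_or_thresholds(\@branch);
# Symbolic features appear as 'name=value' items; truly numeric features appear only
# through '<' and '>' threshold items, as required above.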
1286
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1287
329
329
0
213
my $self = shift;
1288
329
236
my $arr = shift;
1289
329
379
my @array_of_features_and_values_or_thresholds = @$arr;
1290
329
50
422
return if scalar @array_of_features_and_values_or_thresholds == 0;
1291
329
392
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1292
329
100
650
return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1293
96
74
my $probability = undef;
1294
96
64
my $pattern1 = '(.+)=(.+)';
1295
96
55
my $pattern2 = '(.+)<(.+)';
1296
96
68
my $pattern3 = '(.+)>(.+)';
1297
96
64
my @true_numeric_types = ();
1298
96
62
my @true_numeric_types_feature_names = ();
1299
96
66
my @symbolic_types = ();
1300
96
65
my @symbolic_types_feature_names = ();
1301
96
82
foreach my $item (@array_of_features_and_values_or_thresholds) {
1302
282
50
502
if ($item =~ /$pattern2/) {
50
1303
0
0
push @true_numeric_types, $item;
1304
0
0
my ($feature,$value) = ($1,$2);
1305
0
0
push @true_numeric_types_feature_names, $feature;
1306
} elsif ($item =~ /$pattern3/) {
1307
0
0
push @true_numeric_types, $item;
1308
0
0
my ($feature,$value) = ($1,$2);
1309
0
0
push @true_numeric_types_feature_names, $feature;
1310
} else {
1311
282
215
push @symbolic_types, $item;
1312
282
409
$item =~ /$pattern1/;
1313
282
323
my ($feature,$value) = ($1,$2);
1314
282
299
push @symbolic_types_feature_names, $feature;
1315
}
1316
}
1317
96
91
my %seen1 = ();
1318
96
0
89
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1319
96
63
my %seen2 = ();
1320
96
50
85
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
282
622
1321
96
132
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1322
print_array_with_msg("POS: Answer returned by find_bounded: ",
1323
96
50
124
$bounded_intervals_numeric_types) if $self->{_debug2};
1324
# Calculate the upper and the lower bounds to be used when searching for the best
1325
# threshold for each of the numeric features that are in play at the current node:
1326
96
68
my (%upperbound, %lowerbound);
1327
96
87
foreach my $feature_name (@true_numeric_types_feature_names) {
1328
0
0
$upperbound{$feature_name} = undef;
1329
0
0
$lowerbound{$feature_name} = undef;
1330
}
1331
96
88
foreach my $item (@$bounded_intervals_numeric_types) {
1332
0
0
foreach my $feature_grouping (@$item) {
1333
0
0
0
if ($feature_grouping->[1] eq '>') {
1334
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1335
} else {
1336
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337
}
1338
}
1339
}
1340
96
85
foreach my $feature_name (@true_numeric_types_feature_names) {
1341
0
0
0
0
if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
0
0
0
0
0
0
0
1342
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1343
0
0
return 0;
1344
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1345
0
0
0
if (! $probability) {
1346
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1347
$upperbound{$feature_name}) -
1348
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1349
} else {
1350
$probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1351
$upperbound{$feature_name}) -
1352
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1353
}
1354
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1355
0
0
0
if (! $probability) {
1356
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1357
0
0
$upperbound{$feature_name});
1358
} else {
1359
$probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1360
0
0
$upperbound{$feature_name});
1361
}
1362
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1363
0
0
0
if (! $probability) {
1364
$probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1365
0
0
$lowerbound{$feature_name});
1366
} else {
1367
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1368
0
0
$lowerbound{$feature_name}));
1369
}
1370
} else {
1371
0
0
die("Ill formatted call to 'probability_of_sequence' method");
1372
}
1373
}
1374
96
85
foreach my $feature_and_value (@symbolic_types) {
1375
282
50
750
if ($feature_and_value =~ /$pattern1/) {
1376
282
338
my ($feature,$value) = ($1,$2);
1377
282
100
276
if (! $probability) {
1378
96
138
$probability = $self->probability_of_feature_value($feature, $value);
1379
} else {
1380
186
203
$probability *= $self->probability_of_feature_value($feature, $value);
1381
}
1382
}
1383
}
1384
96
176
$self->{_probability_cache}->{$sequence} = $probability;
1385
96
197
return $probability;
1386
}
1387
1388
## The following method requires that all truly numeric types only be expressed as
1389
## '<' or '>' constructs in the array of branch features and thresholds
1390
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1391
220
220
0
150
my $self = shift;
1392
220
133
my $arr = shift;
1393
220
167
my $class_name = shift;
1394
220
229
my @array_of_features_and_values_or_thresholds = @$arr;
1395
220
50
315
return if scalar @array_of_features_and_values_or_thresholds == 0;
1396
220
245
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1397
220
212
my $sequence_with_class = "$sequence" . "::" . $class_name;
1398
return $self->{_probability_cache}->{$sequence_with_class}
1399
220
100
343
if exists $self->{_probability_cache}->{$sequence_with_class};
1400
200
126
my $probability = undef;
1401
200
147
my $pattern1 = '(.+)=(.+)';
1402
200
103
my $pattern2 = '(.+)<(.+)';
1403
200
128
my $pattern3 = '(.+)>(.+)';
1404
200
134
my @true_numeric_types = ();
1405
200
144
my @true_numeric_types_feature_names = ();
1406
200
128
my @symbolic_types = ();
1407
200
132
my @symbolic_types_feature_names = ();
1408
200
171
foreach my $item (@array_of_features_and_values_or_thresholds) {
1409
572
50
1104
if ($item =~ /$pattern2/) {
50
1410
0
0
push @true_numeric_types, $item;
1411
0
0
my ($feature,$value) = ($1,$2);
1412
0
0
push @true_numeric_types_feature_names, $feature;
1413
} elsif ($item =~ /$pattern3/) {
1414
0
0
push @true_numeric_types, $item;
1415
0
0
my ($feature,$value) = ($1,$2);
1416
0
0
push @true_numeric_types_feature_names, $feature;
1417
} else {
1418
572
416
push @symbolic_types, $item;
1419
572
898
$item =~ /$pattern1/;
1420
572
733
my ($feature,$value) = ($1,$2);
1421
572
578
push @symbolic_types_feature_names, $feature;
1422
}
1423
}
1424
200
198
my %seen1 = ();
1425
200
0
187
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1426
200
138
my %seen2 = ();
1427
200
50
161
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
572
1307
1428
200
269
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1429
print_array_with_msg("POSC: Answer returned by find_bounded: ",
1430
200
50
250
$bounded_intervals_numeric_types) if $self->{_debug2};
1431
# Calculate the upper and the lower bounds to be used when searching for the best
1432
# threshold for each of the numeric features that are in play at the current node:
1433
200
135
my (%upperbound, %lowerbound);
1434
200
177
foreach my $feature_name (@true_numeric_types_feature_names) {
1435
0
0
$upperbound{$feature_name} = undef;
1436
0
0
$lowerbound{$feature_name} = undef;
1437
}
1438
200
171
foreach my $item (@$bounded_intervals_numeric_types) {
1439
0
0
foreach my $feature_grouping (@$item) {
1440
0
0
0
if ($feature_grouping->[1] eq '>') {
1441
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1442
} else {
1443
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444
}
1445
}
1446
}
1447
200
158
foreach my $feature_name (@true_numeric_types_feature_names) {
1448
0
0
0
0
if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
0
0
0
0
0
0
0
1449
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1450
0
0
return 0;
1451
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1452
0
0
0
if (! $probability) {
1453
1454
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1455
$upperbound{$feature_name}, $class_name) -
1456
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457
0
0
$lowerbound{$feature_name}, $class_name);
1458
} else {
1459
$probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1460
$upperbound{$feature_name}, $class_name) -
1461
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462
0
0
$lowerbound{$feature_name}, $class_name))
1463
}
1464
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1465
0
0
0
if (! $probability) {
1466
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1467
0
0
$upperbound{$feature_name}, $class_name);
1468
} else {
1469
$probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1470
0
0
$upperbound{$feature_name}, $class_name);
1471
}
1472
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1473
0
0
0
if (! $probability) {
1474
$probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1475
0
0
$lowerbound{$feature_name}, $class_name);
1476
} else {
1477
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1478
0
0
$lowerbound{$feature_name}, $class_name));
1479
}
1480
} else {
1481
0
0
die("Ill formatted call to 'probability of sequence given class' method");
1482
}
1483
}
1484
200
150
foreach my $feature_and_value (@symbolic_types) {
1485
572
50
1496
if ($feature_and_value =~ /$pattern1/) {
1486
572
688
my ($feature,$value) = ($1,$2);
1487
572
100
560
if (! $probability) {
1488
200
287
$probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1489
} else {
1490
372
379
$probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491
}
1492
}
1493
}
1494
200
383
$self->{_probability_cache}->{$sequence_with_class} = $probability;
1495
200
454
return $probability;
1496
}
1497
1498
sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1499
342
342
0
222
my $self = shift;
1500
342
257
my $class_name = shift;
1501
342
187
my $arr = shift;
1502
342
355
my @array_of_features_and_values_or_thresholds = @$arr;
1503
342
344
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1504
342
327
my $class_and_sequence = "$class_name" . "::" . $sequence;
1505
return $self->{_probability_cache}->{$class_and_sequence}
1506
342
100
822
if exists $self->{_probability_cache}->{$class_and_sequence};
1507
110
119
my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
110
174
1508
110
73
foreach my $i (0..@{$self->{_class_names}}-1) {
110
137
1509
220
173
my $class_name = $self->{_class_names}->[$i];
1510
220
261
my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1511
\@array_of_features_and_values_or_thresholds, $class_name);
1512
220
100
307
if ($prob < 0.000001) {
1513
1
1
$array_of_class_probabilities[$i] = 0.0;
1514
1
5
next;
1515
}
1516
219
278
my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1517
\@array_of_features_and_values_or_thresholds);
1518
# die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1519
# "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1520
# if ! $prob_of_feature_sequence;
1521
219
272
my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1522
219
50
227
if ($prob_of_feature_sequence) {
1523
219
302
$array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1524
} else {
1525
0
0
$array_of_class_probabilities[$i] = $prior;
1526
}
1527
}
1528
110
78
my $sum_probability;
1529
110
98
map {$sum_probability += $_} @array_of_class_probabilities;
220
229
1530
110
50
122
if ($sum_probability == 0) {
1531
0
0
@array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
0
0
1532
0
0
(0..@{$self->{_class_names}}-1);
0
0
1533
} else {
1534
110
88
@array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
220
240
1535
}
1536
110
97
foreach my $i (0..@{$self->{_class_names}}-1) {
110
197
1537
220
266
my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1538
220
464
$self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1539
}
1540
110
192
return $self->{_probability_cache}->{$class_and_sequence};
1541
}
1542
1543
####################################### Class Based Utilities ##########################################
1544
1545
## Given a list of branch attributes for the numeric features of the form, say,
1546
## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1547
## smallest list that is relevant for the purpose of calculating the probabilities.
1548
## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1549
## time, less than 2, AND, at the same time, less than 3, is the same as the
1550
## probability that the feature is less than 1. Similarly, the probability that `age'
1551
## is greater than 34 and also greater than 37 is the same as `age' being greater
1552
## than 37.
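## For the example above, the returned structure reduces (roughly) to
##     [ [['g2', '<', '1']], [['age', '>', '37']] ]
## i.e., only the tightest '<' bound and the tightest '>' bound survive for each
## numeric feature.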
1553
sub find_bounded_intervals_for_numeric_features {
1554
325
325
0
243
my $self = shift;
1555
325
194
my $arr = shift;
1556
325
277
my @arr = @$arr;
1557
325
180
my @features = @{$self->{_feature_names}};
325
454
1558
325
247
my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
0
0
0
0
1559
325
50
436
print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1560
325
201
my @arr3 = ();
1561
325
287
foreach my $feature_name (@features) {
1562
1300
812
my @temp = ();
1563
1300
956
foreach my $x (@arr1) {
1564
0
0
0
0
push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1565
}
1566
1300
50
1657
push @arr3, \@temp if @temp > 0;
1567
}
1568
325
50
403
print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1569
# Sort each list so that '<' entries occur before '>' entries:
1570
325
190
my @arr4;
1571
325
237
foreach my $li (@arr3) {
1572
0
0
my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
0
0
1573
0
0
push @arr4, \@sorted;
1574
}
1575
325
50
388
print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1576
325
179
my @arr5;
1577
325
274
foreach my $li (@arr4) {
1578
0
0
my @temp1 = ();
1579
0
0
my @temp2 = ();
1580
0
0
foreach my $inner (@$li) {
1581
0
0
0
if ($inner->[1] eq '<') {
1582
0
0
push @temp1, $inner;
1583
} else {
1584
0
0
push @temp2, $inner;
1585
}
1586
}
1587
0
0
0
0
if (@temp1 > 0 && @temp2 > 0) {
0
1588
0
0
push @arr5, [\@temp1, \@temp2];
1589
} elsif (@temp1 > 0) {
1590
0
0
push @arr5, [\@temp1];
1591
} else {
1592
0
0
push @arr5, [\@temp2];
1593
}
1594
}
1595
325
50
366
print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1596
325
226
my @arr6 = ();
1597
325
224
foreach my $li (@arr5) {
1598
0
0
my @temp1 = ();
1599
0
0
foreach my $inner (@$li) {
1600
0
0
my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
0
0
1601
0
0
push @temp1, \@sorted;
1602
}
1603
0
0
push @arr6, \@temp1;
1604
}
1605
325
50
384
print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1606
325
214
my @arr9 = ();
1607
325
276
foreach my $li (@arr6) {
1608
0
0
foreach my $alist (@$li) {
1609
0
0
my @newalist = ();
1610
0
0
0
if ($alist->[0][1] eq '<') {
1611
0
0
push @newalist, $alist->[0];
1612
} else {
1613
0
0
push @newalist, $alist->[-1];
1614
}
1615
0
0
0
if ($alist->[0][1] ne $alist->[-1][1]) {
1616
0
0
push @newalist, $alist->[-1];
1617
}
1618
0
0
push @arr9, \@newalist;
1619
}
1620
}
1621
325
50
366
print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1622
325
405
return \@arr9;
1623
1624
}
1625
1626
## This method is used to verify that you used legal feature names in the test
1627
## sample that you want to classify with the decision tree.
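## For example (with hypothetical feature names 'age' and 'g2' in the training data):
##     $dt->check_names_used(['age=45', 'g2=2.7']);    # returns 1
##     $dt->check_names_used(['agee=45', 'g2=2.7']);   # returns 0 (unknown feature name)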
1628
sub check_names_used {
1629
0
0
0
0
my $self = shift;
1630
0
0
my $features_and_values_test_data = shift;
1631
0
0
my @features_and_values_test_data = @$features_and_values_test_data;
1632
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
1633
0
0
foreach my $feature_and_value (@features_and_values_test_data) {
1634
0
0
$feature_and_value =~ /$pattern/;
1635
0
0
my ($feature,$value) = ($1,$2);
1636
0
0
0
0
die "Your test data has formatting error" unless defined($feature) && defined($value);
1637
0
0
0
return 0 unless contained_in($feature, @{$self->{_feature_names}});
0
0
1638
}
1639
0
0
return 1;
1640
}
1641
1642
####################################### Data Condition Calculator ######################################
1643
1644
## This method estimates the worst-case fan-out of the decision tree taking into
1645
## account the number of values (and therefore the number of branches emanating from
1646
## a node) for the symbolic features.
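## As a rough illustration: with 4 symbolic features whose largest value set has 5
## values, the worst-case estimate computed below is 5 ** 4 = 625 nodes.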
1647
sub determine_data_condition {
1648
0
0
0
0
my $self = shift;
1649
0
0
my $num_of_features = scalar @{$self->{_feature_names}};
0
0
1650
0
0
my @values = ();
1651
0
0
my @number_of_values;
1652
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
1653
0
0
push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1654
0
0
0
if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
0
0
1655
0
0
push @number_of_values, scalar @values;
1656
}
1657
0
0
0
return if ! @values;
1658
0
0
print "Number of features: $num_of_features\n";
1659
0
0
my @minmax = minmax(\@number_of_values);
1660
0
0
my $max_num_values = $minmax[1];
1661
0
0
print "Largest number of values for symbolic features is: $max_num_values\n";
1662
0
0
my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1663
0
0
print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1664
"nodes. The exact number of nodes created depends critically on " .
1665
"the entropy_threshold used for node expansion (the default value " .
1666
"for this threshold is 0.01) and on the value set for max_depth_desired " .
1667
"for the depth of the tree.\n";
1668
0
0
0
if ($estimated_number_of_nodes > 10000) {
1669
0
0
print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1670
"large value for entropy_threshold and/or a small value for " .
1671
"for max_depth_desired to reduce the number of nodes created.\n";
1672
0
0
print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1673
0
0
my $answer = <STDIN>;
1674
0
0
$answer =~ s/\r?\n?$//;
1675
0
0
0
while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1676
0
0
print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1677
0
0
$answer = <STDIN>;
1678
0
0
$answer =~ s/\r?\n?$//;
1679
}
1680
0
0
0
die unless $answer =~ /y(es)?/i;
1681
}
1682
}
1683
1684
1685
#################################### Read Training Data From File ######################################
1686
1687
1688
sub get_training_data {
1689
1
1
1
6
my $self = shift;
1690
1
50
8
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1691
1
1
my %class_names = ();
1692
1
3
my %all_record_ids_with_class_labels;
1693
my $firstline;
1694
0
0
my %data_hash;
1695
1
3
$|++;
1696
1
50
22
open FILEIN, $self->{_training_datafile} or die "unable to open $self->{_training_datafile}: $!";
1697
1
1
my $record_index = 0;
1698
1
1
my $firsetline;
1699
1
23
while (<FILEIN>) {
1700
36
50
65
next if /^[ ]*\r?\n?$/;
1701
36
75
$_ =~ s/\r?\n?$//;
1702
36
38
my $record = cleanup_csv($_);
1703
36
100
49
if ($record_index == 0) {
1704
1
1
$firstline = $record;
1705
1
1
$record_index++;
1706
1
3
next;
1707
}
1708
35
112
my @parts = split /,/, $record;
1709
35
35
my $classname = $parts[$self->{_csv_class_column_index}];
1710
35
25
$class_names{$classname} = 1;
1711
35
23
my $record_label = shift @parts;
1712
35
45
$record_label =~ s/^\s*\"|\"\s*$//g;
1713
35
40
$data_hash{$record_label} = \@parts;
1714
35
32
$all_record_ids_with_class_labels{$record_label} = $classname;
1715
35
50
43
print "." if $record_index % 10000 == 0;
1716
35
70
$record_index++;
1717
}
1718
1
5
close FILEIN;
1719
1
2
$|--;
1720
1
3
$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1721
1
50
3
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1722
1
4
my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1723
1
2
my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1724
1
3
my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
2
6
1725
1
2
my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
4
5
1
2
1726
1
5
my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
35
57
1727
1
7
my @sample_names = map {"sample_$_"} keys %data_hash;
35
29
1728
1
50
5
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
35
26
35
15
140
107
140
96
140
288
35
34
1729
1
50
5
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
4
4
4
3
4
11
140
81
140
193
140
108
1
2
1730
1
2
my %numeric_features_valuerange_hash = ();
1731
1
1
my %feature_values_how_many_uniques_hash = ();
1732
1
1
my %features_and_unique_values_hash = ();
1733
1
2
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1734
1
3
foreach my $feature (keys %features_and_values_hash) {
1735
4
3
my %seen1 = ();
1736
140
100
66
398
my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1737
4
4
@{$features_and_values_hash{$feature}};
4
4
1738
4
6
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1739
4
3
my $not_all_values_float = 0;
1740
4
50
5
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
14
89
1741
4
50
6
if ($not_all_values_float == 0) {
1742
0
0
my @minmaxvalues = minmax(\@unique_values_for_feature);
1743
0
0
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1744
}
1745
4
10
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1746
}
1747
1
50
7
if ($self->{_debug1}) {
1748
0
0
print "\nAll class names: @all_class_names\n";
1749
0
0
print "\nEach sample data record:\n";
1750
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
0
0
1751
0
0
print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
0
0
1752
}
1753
0
0
print "\nclass label for each data sample:\n";
1754
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
0
0
1755
0
0
print "$sample => $class_for_sample_hash{$sample}\n";
1756
}
1757
0
0
print "\nFeatures used: @feature_names\n\n";
1758
0
0
print "\nfeatures and the values taken by them:\n";
1759
0
0
foreach my $feature (sort keys %features_and_values_hash) {
1760
0
0
print "$feature => @{$features_and_values_hash{$feature}}\n";
0
0
1761
}
1762
0
0
print "\nnumeric features and their ranges:\n";
1763
0
0
foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1764
0
0
print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
0
0
1765
}
1766
0
0
print "\nnumber of unique values in each feature:\n";
1767
0
0
foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1768
0
0
print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1769
}
1770
}
1771
1
2
$self->{_class_names} = \@all_class_names;
1772
1
2
$self->{_feature_names} = \@feature_names;
1773
1
1
$self->{_samples_class_label_hash} = \%class_for_sample_hash;
1774
1
6
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
1775
1
2
$self->{_features_and_values_hash} = \%features_and_values_hash;
1776
1
1
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1777
1
2
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1778
1
19
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1779
}
1780
1781
sub show_training_data {
1782
0
0
1
0
my $self = shift;
1783
0
0
my @class_names = @{$self->{_class_names}};
0
0
1784
0
0
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
0
0
1785
0
0
my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
0
0
1786
0
0
my %training_data_hash = %{$self->{_training_data_hash}};
0
0
1787
0
0
print "\n\nClass Names: @class_names\n";
1788
0
0
print "\n\nFeatures and Their Values:\n\n";
1789
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
1790
0
0
print "$k ---> @{$features_and_values_hash{$k}}\n";
0
0
1791
}
1792
0
0
print "\n\nSamples vs. Class Labels:\n\n";
1793
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
0
0
1794
0
0
print "$kee => $samples_class_label_hash{$kee}\n";
1795
}
1796
0
0
print "\n\nTraining Samples:\n\n";
1797
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
0
0
1798
keys %training_data_hash) {
1799
0
0
print "$kee => @{$training_data_hash{$kee}}\n";
0
0
1800
}
1801
}
1802
1803
sub get_class_names {
1804
0
0
0
0
my $self = shift;
1805
0
0
return @{$self->{_class_names}}
0
0
1806
}
1807
1808
########################################## Utility Routines ############################################
1809
1810
sub closest_sampling_point {
1811
0
0
0
0
my $value = shift;
1812
0
0
my $arr_ref = shift;
1813
0
0
my @arr = @{$arr_ref};
0
0
1814
0
0
my @compare = map {abs($_ - $value)} @arr;
0
0
1815
0
0
my ($minval,$index) = minimum(\@compare);
1816
0
0
return $arr[$index];
1817
}
1818
1819
## Returns the array index that contains a specified STRING value (meant only for an array of strings):
1820
sub get_index_at_value {
1821
0
0
0
0
my $value = shift;
1822
0
0
my @array = @{shift @_};
0
0
1823
0
0
foreach my $i (0..@array-1) {
1824
0
0
0
return $i if $value eq $array[$i];
1825
}
1826
}
1827
1828
## When the training data is read from a CSV file, we assume that the first column
1829
## of each data record contains a unique integer identifier for the record in that
1830
## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1831
## followed by the identifying integers. The purpose of this function is to return
1832
## the identifying integer associated with a data record.
1833
sub sample_index {
1834
1128
1128
0
629
my $arg = shift;
1835
1128
1115
$arg =~ /_(.+)$/;
1836
1128
1253
return $1;
1837
}
1838
1839
# Returns the minimum value and its positional index in an array
1840
sub minimum {
1841
0
0
0
0
my $arr = shift;
1842
0
0
my $min;
1843
my $index;
1844
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1845
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1846
0
0
$index = $i;
1847
0
0
$min = $arr->[$i];
1848
}
1849
}
1850
0
0
return ($min, $index);
1851
}
1852
1853
# Returns an array of two values, the min and the max, of an array of floats
1854
sub minmax {
1855
0
0
0
0
my $arr = shift;
1856
0
0
my ($min, $max);
1857
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1858
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1859
0
0
$min = $arr->[$i];
1860
}
1861
0
0
0
0
if ( (!defined $max) || ($arr->[$i] > $max) ) {
1862
0
0
$max = $arr->[$i];
1863
}
1864
}
1865
0
0
return ($min, $max);
1866
}
1867
1868
# checks whether an element is in an array:
1869
sub contained_in {
1870
267
267
0
176
my $ele = shift;
1871
267
269
my @array = @_;
1872
267
160
my $count = 0;
1873
267
100
181
map {$count++ if $ele eq $_} @array;
648
1035
1874
267
427
return $count;
1875
}
1876
1877
# Meant only for an array of strings (no nesting):
1878
sub deep_copy_array {
1879
196
196
0
152
my $ref_in = shift;
1880
196
116
my $ref_out;
1881
196
100
254
return [] if scalar @$ref_in == 0;
1882
181
133
foreach my $i (0..@{$ref_in}-1) {
181
232
1883
393
423
$ref_out->[$i] = $ref_in->[$i];
1884
}
1885
181
283
return $ref_out;
1886
}
1887
1888
sub check_for_illegal_params2 {
1889
1
1
0
2
my @params = @_;
1890
1
5
my @legal_params = qw / training_datafile
1891
entropy_threshold
1892
max_depth_desired
1893
csv_class_column_index
1894
csv_columns_for_features
1895
symbolic_to_numeric_cardinality_threshold
1896
number_of_histogram_bins
1897
debug1
1898
debug2
1899
debug3
1900
/;
1901
1
1
my $found_match_flag;
1902
1
2
foreach my $param (@params) {
1903
3
3
foreach my $legal (@legal_params) {
1904
10
9
$found_match_flag = 0;
1905
10
100
14
if ($param eq $legal) {
1906
3
1
$found_match_flag = 1;
1907
3
4
last;
1908
}
1909
}
1910
3
50
4
last if $found_match_flag == 0;
1911
}
1912
1
4
return $found_match_flag;
1913
}
1914
1915
sub print_array_with_msg {
1916
0
0
0
0
my $message = shift;
1917
0
0
my $arr = shift;
1918
0
0
print "\n$message: ";
1919
0
0
print_nested_array( $arr );
1920
}
1921
1922
sub print_nested_array {
1923
0
0
0
0
my $arr = shift;
1924
0
0
my @arr = @$arr;
1925
0
0
print "[";
1926
0
0
foreach my $item (@arr) {
1927
0
0
0
if (ref $item) {
1928
0
0
print_nested_array($item);
1929
} else {
1930
0
0
print "$item";
1931
}
1932
}
1933
0
0
print "]";
1934
}
1935
1936
## Introduced in Version 3.21, I wrote this function in response to a need to
1937
## create a decision tree for a very large national econometric database. The
1938
## fields in the CSV file for this database are allowed to be double quoted and such
1939
## fields may contain commas inside them. This function also replaces empty fields
1940
## with the generic string 'NA' as a shorthand for "Not Available". IMPORTANT: This
1941
## function skips over the first field in each record. It is assumed that the first
1942
## field in the first record that defines the feature names is the empty string ("")
1943
## and the same field in all other records is an ID number for the record.
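## For example, a hypothetical record such as
##     5,"Ann Arbor, MI",,3.7
## would come out roughly as
##     5,Ann_Arbor_MI,NA,3.7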
1944
sub cleanup_csv {
1945
36
36
0
28
my $line = shift;
1946
36
33
$line =~ tr/()[]{}'/ /;
1947
36
51
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1948
36
35
for (@double_quoted) {
1949
0
0
my $item = $_;
1950
0
0
$item = substr($item, 1, -1);
1951
0
0
$item =~ s/^\s+|,|\s+$//g;
1952
0
0
$item = join '_', split /\s+/, $item;
1953
0
0
substr($line, index($line, $_), length($_)) = $item;
1954
}
1955
36
124
my @white_spaced = $line =~ /,\s*[^,]+\s+[^,]+\s*,/g;
1956
36
30
for (@white_spaced) {
1957
0
0
my $item = $_;
1958
0
0
$item = substr($item, 0, -1);
1959
0
0
0
$item = join '_', split /\s+/, $item unless $item =~ /,\s+$/;
1960
0
0
substr($line, index($line, $_), length($_)) = "$item,";
1961
}
1962
36
52
$line =~ s/,\s*(?=,)/,NA/g;
1963
36
38
return $line;
1964
}
1965
1966
######################################### Class EvalTrainingData ########################################
1967
1968
## This subclass of the DecisionTree class is used to evaluate the quality of your
1969
## training data by running a 10-fold cross-validation test on it. This test divides
1970
## all of the training data into ten parts, with nine parts used for training a
1971
## decision tree and one part used for testing its ability to classify correctly.
1972
## This selection of nine parts for training and one part for testing is carried out
1973
## in all of the ten different possible ways. This testing functionality can also
1974
## be used to find the best values to use for the constructor parameters
1975
## entropy_threshold, max_depth_desired, and
1976
## symbolic_to_numeric_cardinality_threshold.
1977
1978
## Only the CSV training files can be evaluated in this manner (because only CSV
1979
## training files are allowed to have numeric features --- which is the more interesting
1980
## case for evaluation analytics).
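## A sketch of typical usage (the file name and column choices are hypothetical):
##     my $eval_data = EvalTrainingData->new(
##                         training_datafile        => 'stage3cancer.csv',
##                         csv_class_column_index   => 2,
##                         csv_columns_for_features => [3,4,5,6,7,8],
##                         entropy_threshold        => 0.01,
##                         max_depth_desired        => 8,
##                     );
##     $eval_data->get_training_data();
##     $eval_data->evaluate_training_data();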
1981
1982
package EvalTrainingData;
1983
1984
@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1985
1986
sub new {
1987
0
0
0
my $class = shift;
1988
0
0
my $instance = Algorithm::DecisionTree->new(@_);
1989
0
0
bless $instance, $class;
1990
}
1991
1992
sub evaluate_training_data {
1993
0
0
0
my $self = shift;
1994
0
0
my $evaldebug = 0;
1995
die "The data evaluation function in the module can only be used when your " .
1996
0
0
0
"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1997
0
0
print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1998
"class-discriminatory power:\n";
1999
0
0
my %all_training_data = %{$self->{_training_data_hash}};
0
0
2000
0
0
my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
2001
Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
2002
0
0
my $fold_size = int(0.1 * (scalar keys %all_training_data));
2003
0
0
print "fold size: $fold_size\n";
2004
0
0
my %confusion_matrix = ();
2005
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2006
0
0
foreach my $inner_class_name (@{$self->{_class_names}}) {
0
0
2007
0
0
$confusion_matrix{$class_name}->{$inner_class_name} = 0;
2008
}
2009
}
2010
0
0
foreach my $fold_index (0..9) {
2011
0
0
print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2012
0
0
my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2013
0
0
my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2014
@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2015
0
0
my %testing_data = ();
2016
0
0
foreach my $x (@testing_samples) {
2017
0
0
$testing_data{$x} = $all_training_data{$x};
2018
}
2019
0
0
my %training_data = ();
2020
0
0
foreach my $x (@training_samples) {
2021
0
0
$training_data{$x} = $all_training_data{$x};
2022
}
2023
0
0
my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2024
0
0
$trainingDT->{_training_data_hash} = \%training_data;
2025
0
0
$trainingDT->{_class_names} = $self->{_class_names};
2026
0
0
$trainingDT->{_feature_names} = $self->{_feature_names};
2027
0
0
$trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2028
0
0
$trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2029
$trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2030
0
0
$self->{_symbolic_to_numeric_cardinality_threshold};
2031
0
0
foreach my $sample_name (@training_samples) {
2032
$trainingDT->{_samples_class_label_hash}->{$sample_name} =
2033
0
0
$self->{_samples_class_label_hash}->{$sample_name};
2034
}
2035
0
0
foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
0
0
2036
0
0
$trainingDT->{_features_and_values_hash}->{$feature} = ();
2037
}
2038
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
2039
0
0
foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
2040
Algorithm::DecisionTree::sample_index($b)}
2041
0
0
keys %{$trainingDT->{_training_data_hash}}) {
2042
0
0
foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
0
0
2043
0
0
$feature_and_value =~ /$pattern/;
2044
0
0
my ($feature,$value) = ($1,$2);
2045
0
0
0
push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
0
0
2046
}
2047
}
2048
0
0
foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2049
0
0
my %seen = ();
2050
0
0
0
0
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2051
0
0
@{$trainingDT->{_features_and_values_hash}->{$feature}};
0
0
2052
0
0
0
if (Algorithm::DecisionTree::contained_in($feature,
2053
0
0
keys %{$self->{_numeric_features_valuerange_hash}})) {
2054
0
0
@unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
0
0
2055
} else {
2056
0
0
@unique_values_for_feature = sort @unique_values_for_feature;
2057
}
2058
0
0
$trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2059
}
2060
0
0
foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
0
0
2061
my @minmaxvalues = Algorithm::DecisionTree::minmax(
2062
0
0
\@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
0
0
2063
0
0
$trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2064
}
2065
0
0
0
if ($evaldebug) {
2066
0
0
print "\n\nprinting samples in the testing set: @testing_samples\n";
2067
0
0
print "\n\nPrinting features and their values in the training set:\n";
2068
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2069
0
0
print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
0
0
2070
}
2071
0
0
print "\n\nPrinting unique values for features:\n";
2072
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
0
0
2073
0
0
print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
0
0
2074
}
2075
0
0
print "\n\nPrinting unique value ranges for features:\n";
2076
0
0
foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
0
0
2077
0
0
print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
0
0
2078
}
2079
}
2080
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
2081
$trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2082
0
0
scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
0
0
2083
}
2084
0
0
0
$trainingDT->{_debug2} = 1 if $evaldebug;
2085
0
0
$trainingDT->calculate_first_order_probabilities();
2086
0
0
$trainingDT->calculate_class_priors();
2087
0
0
my $root_node = $trainingDT->construct_decision_tree_classifier();
2088
0
0
0
$root_node->display_decision_tree(" ") if $evaldebug;
2089
0
0
foreach my $test_sample_name (@testing_samples) {
2090
0
0
my @test_sample_data = @{$all_training_data{$test_sample_name}};
0
0
2091
0
0
0
print "original data in test sample: @test_sample_data\n" if $evaldebug;
2092
0
0
0
0
@test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
0
0
2093
0
0
0
print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2094
0
0
my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
0
0
2095
0
0
my @solution_path = @{$classification{'solution_path'}};
0
0
2096
0
0
delete $classification{'solution_path'};
2097
0
0
my @which_classes = keys %classification;
2098
0
0
@which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
0
0
2099
0
0
my $most_likely_class_label = $which_classes[0];
2100
0
0
0
if ($evaldebug) {
2101
0
0
print "\nClassification:\n\n";
2102
0
0
print " class probability\n";
2103
0
0
print " ---------- -----------\n";
2104
0
0
foreach my $which_class (@which_classes) {
2105
0
0
my $classstring = sprintf("%-30s", $which_class);
2106
0
0
my $valuestring = sprintf("%-30s", $classification{$which_class});
2107
0
0
print " $classstring $valuestring\n";
2108
}
2109
0
0
print "\nSolution path in the decision tree: @solution_path\n";
2110
0
0
print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2111
}
2112
0
0
my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2113
0
0
0
print "$test_sample_name: true_class: $true_class_label_for_sample " .
2114
"estimated_class: $most_likely_class_label\n" if $evaldebug;
2115
0
0
$confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2116
}
2117
}
2118
0
0
print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2119
0
0
my $matrix_header = " " x 30;
2120
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2121
0
0
$matrix_header .= sprintf("%-30s", $class_name);
2122
}
2123
0
0
print "\n" . $matrix_header . "\n\n";
2124
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2125
0
0
my $row_display = sprintf("%-30s", $row_class_name);
2126
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2127
0
0
$row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2128
}
2129
0
0
print "$row_display\n\n";
2130
}
2131
0
0
print "\n\n";
2132
0
0
my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2133
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2134
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2135
0
0
0
if ($row_class_name eq $col_class_name) {
2136
0
0
$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2137
} else {
2138
0
0
$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2139
}
2140
}
2141
}
2142
0
0
my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2143
0
0
print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2144
0
0
0
0
if ($data_quality_index <= 80) {
0
0
0
0
0
2145
0
0
print "\nYour training data does not possess much class discriminatory " .
2146
"information. It could be that the classes are inherently not well " .
2147
"separable or that your constructor parameter choices are not appropriate.\n";
2148
} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2149
0
0
print "\nYour training data possesses some class discriminatory information " .
2150
"but it may not be sufficient for real-world applications. You might " .
2151
"try tweaking the constructor parameters to see if that improves the " .
2152
"class discriminations.\n";
2153
} elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2154
0
0
print "\nYour training data appears to possess good class discriminatory " .
2155
"information. Whether or not it is acceptable would depend on your " .
2156
"application.\n";
2157
} elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2158
0
0
print "\nYour training data is of excellent quality.\n";
2159
} else {
2160
0
0
print "\nYour training data is perfect.\n";
2161
}
2162
2163
}
2164
2165
2166
############################################# Class DTNode #############################################
2167
2168
# The nodes of the decision tree are instances of this class:
2169
2170
package DTNode;
2171
2172
1
1
11
use strict;
1
1
1
19
2173
1
1
3
use Carp;
1
1
1
806
2174
2175
# $feature is the feature test at the current node. $branch_features_and_values is
2176
# an anonymous array holding the feature names and corresponding values on the path
2177
# from the root to the current node:
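# For instance (a hypothetical path), $branch_features_and_values_or_thresholds could
# hold ['grade=2.0', 'gleason=4.0'] for a node two levels below the root.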
2178
sub new {
2179
30
30
35
my ($class, $feature, $entropy, $class_probabilities,
2180
$branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2181
30
100
42
$root_or_not = '' if !defined $root_or_not;
2182
30
100
37
if ($root_or_not eq 'root') {
2183
1
2
$dt->{nodes_created} = -1;
2184
1
2
$dt->{class_names} = undef;
2185
}
2186
30
96
my $self = {
2187
_dt => $dt,
2188
_feature => $feature,
2189
_node_creation_entropy => $entropy,
2190
_class_probabilities => $class_probabilities,
2191
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2192
_linked_to => [],
2193
};
2194
30
33
bless $self, $class;
2195
30
42
$self->{_serial_number} = $self->get_next_serial_num();
2196
30
30
return $self;
2197
}
2198
2199
sub how_many_nodes {
2200
0
0
0
my $self = shift;
2201
0
0
return $self->{_dt}->{nodes_created} + 1;
2202
}
2203
2204
sub set_class_names {
2205
1
1
1
my $self = shift;
2206
1
1
my $class_names_list = shift;
2207
1
2
$self->{_dt}->{class_names} = $class_names_list;
2208
}
2209
2210
sub get_class_names {
2211
0
0
0
my $self = shift;
2212
0
0
return $self->{_dt}->{class_names};
2213
}
2214
2215
sub get_next_serial_num {
2216
30
30
16
my $self = shift;
2217
30
33
$self->{_dt}->{nodes_created} += 1;
2218
30
42
return $self->{_dt}->{nodes_created};
2219
}
2220
2221
sub get_serial_num {
2222
30
30
27
my $self = shift;
2223
30
29
$self->{_serial_number};
2224
}
2225
2226
# this returns the feature test at the current node
2227
sub get_feature {
2228
0
0
0
my $self = shift;
2229
0
0
return $self->{ _feature };
2230
}
2231
2232
sub set_feature {
2233
29
29
25
my $self = shift;
2234
29
19
my $feature = shift;
2235
29
36
$self->{_feature} = $feature;
2236
}
2237
2238
sub get_node_entropy {
2239
30
30
21
my $self = shift;
2240
30
29
return $self->{_node_creation_entropy};
2241
}
2242
2243
sub get_class_probabilities {
2244
0
0
0
my $self = shift;
2245
0
0
return $self->{ _class_probabilities};
2246
}
2247
2248
sub get_branch_features_and_values_or_thresholds {
2249
30
30
19
my $self = shift;
2250
30
39
return $self->{_branch_features_and_values_or_thresholds};
2251
}
2252
2253
sub add_to_branch_features_and_values {
2254
0
0
0
my $self = shift;
2255
0
0
my $feature_and_value = shift;
2256
0
0
push @{$self->{ _branch_features_and_values }}, $feature_and_value;
0
0
2257
}
2258
2259
sub get_children {
2260
0
0
0
my $self = shift;
2261
0
0
return $self->{_linked_to};
2262
}
2263
2264
sub add_child_link {
2265
29
29
20
my ($self, $new_node, ) = @_;
2266
29
17
push @{$self->{_linked_to}}, $new_node;
29
40
2267
}
2268
2269
sub delete_all_links {
2270
0
0
0
my $self = shift;
2271
0
0
$self->{_linked_to} = undef;
2272
}
2273
2274
sub display_node {
2275
0
0
0
my $self = shift;
2276
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2277
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2278
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2279
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2280
0
0
my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2281
0
0
my $serial_num = $self->get_serial_num();
2282
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2283
0
0
print "\n\nNODE $serial_num" .
2284
":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2285
"\n Class probabilities at current node: @class_probabilities_for_display" .
2286
"\n Entropy at current node: $print_node_creation_entropy_at_node" .
2287
"\n Best feature test at current node: $feature_at_node\n\n";
2288
}
2289
2290
sub display_decision_tree {
2291
0
0
0
my $self = shift;
2292
0
0
my $offset = shift;
2293
0
0
my $serial_num = $self->get_serial_num();
2294
0
0
0
if (@{$self->get_children()} > 0) {
0
0
2295
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2296
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2297
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2298
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2299
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2300
0
0
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2301
0
0
my @class_names = @{$self->get_class_names()};
0
0
2302
my @print_class_probabilities_with_class =
2303
0
0
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
0
0
2304
0
0
print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2305
0
0
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2306
0
0
print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2307
"$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2308
0
0
$offset .= " ";
2309
0
0
foreach my $child (@{$self->get_children()}) {
0
0
2310
0
0
$child->display_decision_tree($offset);
2311
}
2312
} else {
2313
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2314
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2315
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2316
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2317
0
0
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2318
0
0
my @class_names = @{$self->get_class_names()};
0
0
2319
my @print_class_probabilities_with_class =
2320
0
0
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
0
0
2321
0
0
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2322
0
0
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2323
0
0
print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2324
"Class Probs: @print_class_probabilities_with_class\n\n";
2325
}
2326
}
2327
2328
2329
############################## Generate Your Own Numeric Training Data #################################
2330
############################# Class TrainingDataGeneratorNumeric ################################
2331
2332
## See the script generate_training_data_numeric.pl in the examples
2333
## directory on how to use this class for generating your own numeric training and
2334
## test data. The training and test data are generated in accordance with the
2335
## specifications you place in the parameter file that is supplied as an argument to
2336
## the constructor of this class.
2337
2338
package TrainingDataGeneratorNumeric;
2339
2340
1
1
4
use strict;
1
2
1
15
2341
1
1
3
use Carp;
1
1
1
744
2342
2343
sub new {
2344
0
0
0
my ($class, %args) = @_;
2345
0
0
my @params = keys %args;
2346
0
0
0
croak "\nYou have used a wrong name for a keyword argument " .
2347
"--- perhaps a misspelling\n"
2348
if check_for_illegal_params3(@params) == 0;
2349
bless {
2350
_output_training_csv_file => $args{'output_training_csv_file'}
2351
|| croak("name for output_training_csv_file required"),
2352
_output_test_csv_file => $args{'output_test_csv_file'}
2353
|| croak("name for output_test_csv_file required"),
2354
_parameter_file => $args{'parameter_file'}
2355
|| croak("parameter_file required"),
2356
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2357
|| croak("number_of_samples_for_training"),
2358
_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2359
|| croak("number_of_samples_for_testing"),
2360
0
0
0
_debug => $args{debug} || 0,
0
0
0
0
0
2361
_class_names => [],
2362
_class_names_and_priors => {},
2363
_features_with_value_range => {},
2364
_features_ordered => [],
2365
_classes_and_their_param_values => {},
2366
}, $class;
2367
}
2368
2369
sub check_for_illegal_params3 {
2370
0
0
0
my @params = @_;
2371
0
0
my @legal_params = qw / output_training_csv_file
2372
output_test_csv_file
2373
parameter_file
2374
number_of_samples_for_training
2375
number_of_samples_for_testing
2376
debug
2377
/;
2378
0
0
my $found_match_flag;
2379
0
0
foreach my $param (@params) {
2380
0
0
foreach my $legal (@legal_params) {
2381
0
0
$found_match_flag = 0;
2382
0
0
0
if ($param eq $legal) {
2383
0
0
$found_match_flag = 1;
2384
0
0
last;
2385
}
2386
}
2387
0
0
0
last if $found_match_flag == 0;
2388
}
2389
0
0
return $found_match_flag;
2390
}
2391
2392
## The training data generated by an instance of the class
2393
## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file that
2394
## you supply to the class constructor through a constructor variable called
2395
## `parameter_file'. This method is for parsing the parameter file in order to
2396
## determine the names to be used for the different data classes, their
2397
## means, and their variances.
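##
## The parser below expects the parameter file to contain entries that match
## its regexes; an illustrative (hypothetical) layout consistent with that
## parsing, for two features and two classes, would look like:
##
##     class names: class1 class2
##     class priors: 0.5 0.5
##     feature name: feature_1
##     value range: 0.0 100.0
##     feature name: feature_2
##     value range: 0.0 50.0
##     params for class: class1
##     mean: 20.0 30.0
##     covariance:
##         10.0  0.0
##          0.0 10.0
##
## with one "feature name ... value range" block per feature and one
## "params for class ... mean ... covariance" block per class.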
2398
sub read_parameter_file_numeric {
2399
0
0
0
my $self = shift;
2400
0
0
my @class_names = ();
2401
0
0
my %class_names_and_priors = ();
2402
0
0
my %features_with_value_range = ();
2403
0
0
my %classes_and_their_param_values = ();
2404
# my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2405
0
0
0
open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
2406
0
0
my @params = <FILE>;
2407
0
0
my $params = join "", @params;
2408
0
0
my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2409
0
0
$params =~ /$regex/si;
2410
0
0
my ($class_names, $class_priors) = ($1, $2);
2411
0
0
@class_names = split ' ', $class_names;
2412
0
0
my @class_priors = split ' ', $class_priors;
2413
0
0
foreach my $i (0..@class_names-1) {
2414
0
0
$class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2415
}
2416
0
0
0
if ($self->{_debug}) {
2417
0
0
foreach my $cname (keys %class_names_and_priors) {
2418
0
0
print "$cname => $class_names_and_priors{$cname}\n";
2419
}
2420
}
2421
0
0
$regex = 'feature name: \w*.*?value range: [\d\. -]+';
2422
0
0
my @features = $params =~ /$regex/gsi;
2423
0
0
my @features_ordered;
2424
0
0
$regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2425
0
0
foreach my $feature (@features) {
2426
0
0
$feature =~ /$regex/i;
2427
0
0
my $feature_name = $1;
2428
0
0
push @features_ordered, $feature_name;
2429
0
0
my @value_range = split ' ', $2;
2430
0
0
$features_with_value_range{$feature_name} = \@value_range;
2431
}
2432
0
0
0
if ($self->{_debug}) {
2433
0
0
foreach my $fname (keys %features_with_value_range) {
2434
0
0
print "$fname => @{$features_with_value_range{$fname}}\n";
0
0
2435
}
2436
}
2437
0
0
foreach my $i (0..@class_names-1) {
2438
0
0
$classes_and_their_param_values{$class_names[$i]} = {};
2439
}
2440
0
0
$regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2441
0
0
my @class_params = $params =~ /$regex/gsi;
2442
0
0
$regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2443
0
0
foreach my $class_param (@class_params) {
2444
0
0
$class_param =~ /$regex/gsi;
2445
0
0
my $class_name = $1;
2446
0
0
my @class_mean = split ' ', $2;
2447
0
0
$classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2448
0
0
my $class_param_string = $3;
2449
0
0
my @covar_rows = split '\n', $class_param_string;
2450
0
0
my @covar_matrix;
2451
0
0
foreach my $row (@covar_rows) {
2452
0
0
my @row = split ' ', $row;
2453
0
0
push @covar_matrix, \@row;
2454
}
2455
0
0
$classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2456
}
2457
0
0
0
if ($self->{_debug}) {
2458
0
0
print "\nThe class parameters are:\n\n";
2459
0
0
foreach my $cname (keys %classes_and_their_param_values) {
2460
0
0
print "\nFor class name $cname:\n";
2461
0
0
my %params_hash = %{$classes_and_their_param_values{$cname}};
0
0
2462
0
0
foreach my $x (keys %params_hash) {
2463
0
0
0
if ($x eq 'mean') {
2464
0
0
print " $x => @{$params_hash{$x}}\n";
0
0
2465
} else {
2466
0
0
0
if ($x eq 'covariance') {
2467
0
0
print " The covariance matrix:\n";
2468
0
0
my @matrix = @{$params_hash{'covariance'}};
0
0
2469
0
0
foreach my $row (@matrix) {
2470
0
0
print " @$row\n";
2471
}
2472
}
2473
}
2474
}
2475
}
2476
}
2477
0
0
$self->{_class_names} = \@class_names;
2478
0
0
$self->{_class_names_and_priors} = \%class_names_and_priors;
2479
0
0
$self->{_features_with_value_range} = \%features_with_value_range;
2480
0
0
$self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2481
0
0
$self->{_features_ordered} = \@features_ordered;
2482
}
2483
2484
## After the parameter file is parsed by the previous method, this method calls on
2485
## Math::Random::random_multivariate_normal() to generate the training and test data
2486
## samples. Your training and test data can be of any number of dimensions, can
2487
## have any mean, and any covariance. The training and test data must obviously be
2488
## drawn from the same distribution.
2489
sub gen_numeric_training_and_test_data_and_write_to_csv {
2490
1
1
579
use Math::Random;
1
4123
1
654
2491
0
0
0
my $self = shift;
2492
0
0
my %training_samples_for_class;
2493
my %test_samples_for_class;
2494
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2495
0
0
$training_samples_for_class{$class_name} = [];
2496
0
0
$test_samples_for_class{$class_name} = [];
2497
}
2498
0
0
foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
0
0
2499
0
0
my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
0
0
2500
0
0
my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
0
0
2501
my @new_training_data = Math::Random::random_multivariate_normal(
2502
0
0
$self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2503
@mean, @covariance );
2504
my @new_test_data = Math::Random::random_multivariate_normal(
2505
0
0
$self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2506
@mean, @covariance );
2507
0
0
0
if ($self->{_debug}) {
2508
0
0
print "training data for class $class_name:\n";
2509
0
0
foreach my $x (@new_training_data) {print "@$x\n";}
0
0
2510
0
0
print "\n\ntest data for class $class_name:\n";
2511
0
0
foreach my $x (@new_test_data) {print "@$x\n";}
0
0
2512
}
2513
0
0
$training_samples_for_class{$class_name} = \@new_training_data;
2514
0
0
$test_samples_for_class{$class_name} = \@new_test_data;
2515
}
2516
0
0
my @training_data_records = ();
2517
0
0
my @test_data_records = ();
2518
0
0
foreach my $class_name (keys %training_samples_for_class) {
2519
my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2520
0
0
$self->{_class_names_and_priors}->{$class_name};
2521
my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2522
0
0
$self->{_class_names_and_priors}->{$class_name};
2523
0
0
foreach my $sample_index (0..$num_of_samples_for_training-1) {
2524
0
0
my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
0
0
2525
0
0
@training_vector = map {sprintf("%.3f", $_)} @training_vector;
0
0
2526
0
0
my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2527
0
0
push @training_data_records, $training_data_record;
2528
}
2529
0
0
foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2530
0
0
my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
0
0
2531
0
0
@test_vector = map {sprintf("%.3f", $_)} @test_vector;
0
0
2532
0
0
my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2533
0
0
push @test_data_records, $test_data_record;
2534
}
2535
}
2536
0
0
fisher_yates_shuffle(\@training_data_records);
2537
0
0
fisher_yates_shuffle(\@test_data_records);
2538
0
0
0
if ($self->{_debug}) {
2539
0
0
foreach my $record (@training_data_records) {
2540
0
0
print "$record";
2541
}
2542
0
0
foreach my $record (@test_data_records) {
2543
0
0
print "$record";
2544
}
2545
}
2546
0
0
open OUTPUT, ">$self->{_output_training_csv_file}";
2547
0
0
my @feature_names_training = @{$self->{_features_ordered}};
0
0
2548
0
0
my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
0
0
2549
0
0
my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2550
0
0
print OUTPUT "$first_row_training\n";
2551
0
0
foreach my $i (0..@training_data_records-1) {
2552
0
0
my $i1 = $i+1;
2553
0
0
my $sample_record = "\"$i1\",$training_data_records[$i]";
2554
0
0
print OUTPUT "$sample_record";
2555
}
2556
0
0
close OUTPUT;
2557
0
0
open OUTPUT, ">$self->{_output_test_csv_file}";
2558
0
0
my @feature_names_testing = keys %{$self->{_features_with_value_range}};
0
0
2559
0
0
my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
0
0
2560
0
0
my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2561
0
0
print OUTPUT "$first_row_testing\n";
2562
0
0
foreach my $i (0..@test_data_records-1) {
2563
0
0
my $i1 = $i+1;
2564
0
0
my $sample_record = "\"$i1\",$test_data_records[$i]";
2565
0
0
print OUTPUT "$sample_record";
2566
}
2567
0
0
close OUTPUT;
2568
}
2569
2570
# from perl docs:
2571
sub fisher_yates_shuffle {
2572
0
0
0
my $arr = shift;
2573
0
0
my $i = @$arr;
2574
0
0
while (--$i) {
2575
0
0
my $j = int rand( $i + 1 );
2576
0
0
@$arr[$i, $j] = @$arr[$j, $i];
2577
}
2578
}
2579
2580
########################### Generate Your Own Symbolic Training Data ###############################
2581
########################### Class TrainingDataGeneratorSymbolic #############################
2582
2583
## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2584
## this class for generating purely symbolic training and test data. The data is
2585
## generated according to the specifications you place in a parameter file whose
2586
## name you supply as one of the constructor arguments.
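##
## A minimal usage sketch (the file names below are placeholders; the
## constructor keys and method names are the ones defined in this class):
##
##     my $generator = TrainingDataGeneratorSymbolic->new(
##                         output_training_datafile       => 'training_symbolic.csv',
##                         parameter_file                 => 'param_symbolic.txt',
##                         number_of_samples_for_training => 200,
##                     );
##     $generator->read_parameter_file_symbolic();
##     $generator->gen_symbolic_training_data();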
2587
package TrainingDataGeneratorSymbolic;
2588
2589
1
1
8
use strict;
1
2
1
16
2590
1
1
3
use Carp;
1
1
1
1641
2591
2592
sub new {
2593
1
1
12
my ($class, %args) = @_;
2594
1
3
my @params = keys %args;
2595
1
50
3
croak "\nYou have used a wrong name for a keyword argument " .
2596
"--- perhaps a misspelling\n"
2597
if check_for_illegal_params4(@params) == 0;
2598
bless {
2599
_output_training_datafile => $args{'output_training_datafile'}
2600
|| die("name for output_training_datafile required"),
2601
_parameter_file => $args{'parameter_file'}
2602
|| die("parameter_file required"),
2603
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2604
|| die("number_of_samples_for_training required"),
2605
1
50
18
_debug => $args{debug} || 0,
50
50
50
2606
_class_names => [],
2607
_class_priors => [],
2608
_features_and_values_hash => {},
2609
_bias_hash => {},
2610
_training_sample_records => {},
2611
}, $class;
2612
}
2613
2614
sub check_for_illegal_params4 {
2615
1
1
1
my @params = @_;
2616
1
3
my @legal_params = qw / output_training_datafile
2617
parameter_file
2618
number_of_samples_for_training
2619
debug
2620
/;
2621
1
1
my $found_match_flag;
2622
1
2
foreach my $param (@params) {
2623
3
3
foreach my $legal (@legal_params) {
2624
6
4
$found_match_flag = 0;
2625
6
100
9
if ($param eq $legal) {
2626
3
1
$found_match_flag = 1;
2627
3
3
last;
2628
}
2629
}
2630
3
50
5
last if $found_match_flag == 0;
2631
}
2632
1
3
return $found_match_flag;
2633
}
2634
2635
## Read a parameter file for generating symbolic training data. See the script
2636
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2637
## pass the name of the parameter file to the constructor of the
2638
## TrainingDataGeneratorSymbolic class.
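##
## The parser below looks for `class names:', `class priors:', `feature:' and
## `bias:' entries; an illustrative (hypothetical) layout consistent with that
## parsing would be:
##
##     class names: class_a class_b
##     class priors: 0.4 0.6
##     feature: smell values: sweet pungent none
##     feature: color values: red green blue
##     bias: class: class_a smell: sweet=0.6 color: red=0.5
##     bias: class: class_b smell: none=0.5 color: blue=0.7
##
## Each bias entry names, for one class, a value of a feature that should be
## favored and the probability with which that value should occur.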
2639
sub read_parameter_file_symbolic {
2640
1
1
4
my $self = shift;
2641
1
4
my $debug = $self->{_debug};
2642
1
2
my $number_of_training_samples = $self->{_number_of_samples_for_training};
2643
1
1
my $input_parameter_file = $self->{_parameter_file};
2644
1
50
4
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2645
1
1
my $output_file_training = $self->{_output_training_datafile};
2646
1
20
my $output_file_testing = $self->{_output_test_datafile};
2647
1
1
my @all_params;
2648
my $param_string;
2649
1
33
25
open(INPUT, $input_parameter_file) or die "unable to open parameter file: $!";
2650
1
28
@all_params = <INPUT>;
2651
1
3
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
40
46
2652
1
2
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
36
58
2653
1
9
$param_string = join ' ', @all_params;
2654
1
7
my ($class_names, $class_priors, $rest_param) =
2655
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2656
1
50
21
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
3
11
2657
1
2
push @{$self->{_class_names}}, @class_names;
1
2
2658
1
50
4
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
3
10
2659
1
1
push @{$self->{_class_priors}}, @class_priors;
1
2
2660
1
7
my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2661
1
1
my %features_and_values_hash;
2662
1
9
my @features = split /(feature[:])/, $feature_string;
2663
1
50
2
@features = grep {defined($_) && length($_) > 0} @features;
9
22
2664
1
2
foreach my $item (@features) {
2665
8
100
14
next if $item =~ /feature/;
2666
4
11
my @splits = split / /, $item;
2667
4
50
5
@splits = grep {defined($_) && length($_) > 0} @splits;
27
59
2668
4
7
foreach my $i (0..@splits-1) {
2669
22
100
23
if ($i == 0) {
2670
4
7
$features_and_values_hash{$splits[0]} = [];
2671
} else {
2672
18
100
26
next if $splits[$i] =~ /values/;
2673
14
9
push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
14
18
2674
}
2675
}
2676
}
2677
1
2
$self->{_features_and_values_hash} = \%features_and_values_hash;
2678
1
2
my %bias_hash = %{$self->{_bias_hash}};
1
2
2679
1
6
my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2680
1
50
1
@biases = grep {defined($_) && length($_) > 0} @biases;
5
14
2681
1
1
foreach my $item (@biases) {
2682
4
100
11
next if $item =~ /bias/;
2683
2
15
my @splits = split /\s+/, $item;
2684
2
50
2
@splits = grep {defined($_) && length($_) > 0} @splits;
18
40
2685
2
3
my $feature_name;
2686
2
3
foreach my $i (0..@splits-1) {
2687
16
100
29
if ($i == 0) {
100
2688
2
3
$bias_hash{$splits[0]} = {};
2689
} elsif ($splits[$i] =~ /(^.+)[:]$/) {
2690
8
5
$feature_name = $1;
2691
8
15
$bias_hash{$splits[0]}->{$feature_name} = [];
2692
} else {
2693
6
50
8
next if !defined $feature_name;
2694
6
50
8
push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
6
11
2695
if defined $feature_name;
2696
}
2697
}
2698
}
2699
1
1
$self->{_bias_hash} = \%bias_hash;
2700
1
50
10
if ($debug) {
2701
0
0
print "\n\nClass names: @class_names\n";
2702
0
0
my $num_of_classes = @class_names;
2703
0
0
print "Class priors: @class_priors\n";
2704
0
0
print "Number of classes: $num_of_classes\n";
2705
0
0
print "\nHere are the features and their possible values:\n";
2706
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
2707
0
0
print "$k ===> @$v\n";
2708
}
2709
0
0
print "\nHere is the biasing for each class:\n";
2710
0
0
while ( my ($k, $v) = each %bias_hash ) {
2711
0
0
print "$k:\n";
2712
0
0
while ( my ($k1, $v1) = each %$v ) {
2713
0
0
print " $k1 ===> @$v1\n";
2714
}
2715
}
2716
}
2717
}
2718
2719
## This method generates training data according to the specifications placed in a
2720
## parameter file that is read by the previous method.
2721
sub gen_symbolic_training_data {
2722
1
1
4
my $self = shift;
2723
1
2
my @class_names = @{$self->{_class_names}};
1
2
2724
1
2
my @class_priors = @{$self->{_class_priors}};
1
2
2725
1
1
my %training_sample_records;
2726
1
0
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
1
4
2727
1
2
my %bias_hash = %{$self->{_bias_hash}};
1
1
2728
1
2
my $how_many_training_samples = $self->{_number_of_samples_for_training};
2729
1
1
my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2730
1
1
my %class_priors_to_unit_interval_map;
2731
1
1
my $accumulated_interval = 0;
2732
1
2
foreach my $i (0..@class_names-1) {
2733
2
4
$class_priors_to_unit_interval_map{$class_names[$i]}
2734
= [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2735
2
3
$accumulated_interval += $class_priors[$i];
2736
}
2737
1
50
3
if ($self->{_debug}) {
2738
0
0
print "Mapping of class priors to unit interval: \n";
2739
0
0
while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2740
0
0
print "$k => @$v\n";
2741
}
2742
0
0
print "\n\n";
2743
}
2744
1
1
my $ele_index = 0;
2745
1
2
while ($ele_index < $how_many_training_samples) {
2746
35
33
my $sample_name = "sample" . "_$ele_index";
2747
35
50
$training_sample_records{$sample_name} = [];
2748
# Generate class label for this training sample:
2749
35
47
my $roll_the_dice = rand(1.0);
2750
35
22
my $class_label;
2751
35
38
foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2752
49
32
my $v = $class_priors_to_unit_interval_map{$class_name};
2753
49
100
66
123
if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2754
35
25
push @{$training_sample_records{$sample_name}},
35
51
2755
"class=" . $class_name;
2756
35
23
$class_label = $class_name;
2757
35
26
last;
2758
}
2759
}
2760
35
47
foreach my $feature (keys %features_and_values_hash) {
2761
140
90
my @values = @{$features_and_values_hash{$feature}};
140
190
2762
140
125
my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2763
140
111
my $no_bias = 1.0 / @values;
2764
140
100
252
$bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2765
140
84
my %value_priors_to_unit_interval_map;
2766
140
325
my @splits = split /\s*=\s*/, $bias_string;
2767
140
103
my $chosen_for_bias_value = $splits[0];
2768
140
77
my $chosen_bias = $splits[1];
2769
140
135
my $remaining_bias = 1 - $chosen_bias;
2770
140
115
my $remaining_portion_bias = $remaining_bias / (@values -1);
2771
140
50
112
@splits = grep {defined($_) && length($_) > 0} @splits;
280
745
2772
140
95
my $accumulated = 0;
2773
140
152
foreach my $i (0..@values-1) {
2774
490
100
461
if ($values[$i] eq $chosen_for_bias_value) {
2775
140
188
$value_priors_to_unit_interval_map{$values[$i]}
2776
= [$accumulated, $accumulated + $chosen_bias];
2777
140
132
$accumulated += $chosen_bias;
2778
} else {
2779
350
415
$value_priors_to_unit_interval_map{$values[$i]}
2780
= [$accumulated, $accumulated + $remaining_portion_bias];
2781
350
269
$accumulated += $remaining_portion_bias;
2782
}
2783
}
2784
140
107
my $roll_the_dice = rand(1.0);
2785
140
82
my $value_label;
2786
140
160
foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2787
299
185
my $v = $value_priors_to_unit_interval_map{$value_name};
2788
299
100
100
702
if ( ($roll_the_dice >= $v->[0])
2789
&& ($roll_the_dice <= $v->[1]) ) {
2790
140
75
push @{$training_sample_records{$sample_name}},
140
244
2791
$feature . "=" . $value_name;
2792
140
99
$value_label = $value_name;
2793
140
95
last;
2794
}
2795
}
2796
140
50
373
if ($self->{_debug}) {
2797
0
0
print "mapping feature value priors for '$feature' " .
2798
"to unit interval: \n";
2799
0
0
while ( my ($k, $v) =
2800
each %value_priors_to_unit_interval_map ) {
2801
0
0
print "$k => @$v\n";
2802
}
2803
0
0
print "\n\n";
2804
}
2805
}
2806
35
55
$ele_index++;
2807
}
2808
1
1
$self->{_training_sample_records} = \%training_sample_records;
2809
1
50
3
if ($self->{_debug}) {
2810
0
0
print "\n\nPRINTING TRAINING RECORDS:\n\n";
2811
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
0
0
2812
0
0
print "$kee => @{$training_sample_records{$kee}}\n\n";
0
0
2813
}
2814
}
2815
1
2
my $output_training_file = $self->{_output_training_datafile};
2816
1
50
2
print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2817
1
82
open FILEHANDLE, ">$output_training_file";
2818
1
7
my @features = sort keys %features_and_values_hash;
2819
1
2
my $title_string = ',class';
2820
1
1
foreach my $feature_name (@features) {
2821
4
6
$title_string .= ',' . $feature_name;
2822
}
2823
1
10
print FILEHANDLE "$title_string\n";
2824
1
11
my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
105
72
35
49
35
45
2825
1
4
my $record_string = '';
2826
1
2
foreach my $sample_name (@sample_names) {
2827
35
40
$record_string .= "$sample_name,";
2828
35
23
my @record = @{$training_sample_records{"sample_$sample_name"}};
35
50
2829
35
46
my %item_parts_hash;
2830
35
26
foreach my $item (@record) {
2831
175
228
my @splits = grep $_, split /=/, $item;
2832
175
191
$item_parts_hash{$splits[0]} = $splits[1];
2833
}
2834
35
27
$record_string .= $item_parts_hash{"class"};
2835
35
26
delete $item_parts_hash{"class"};
2836
35
61
my @kees = sort keys %item_parts_hash;
2837
35
33
foreach my $kee (@kees) {
2838
140
108
$record_string .= ",$item_parts_hash{$kee}";
2839
}
2840
35
33
print FILEHANDLE "$record_string\n";
2841
35
54
$record_string = '';
2842
}
2843
1
50
close FILEHANDLE;
2844
}
2845
2846
sub sample_index {
2847
0
0
my $arg = shift;
2848
0
$arg =~ /_(.+)$/;
2849
0
return $1;
2850
}
2851
2852
################################# Decision Tree Introspection #######################################
2853
################################# Class DTIntrospection #######################################
2854
2855
package DTIntrospection;
2856
2857
## Instances constructed from this class can provide explanations for the
2858
## classification decisions at the nodes of a decision tree.
2859
##
2860
## When used in the interactive mode, the decision-tree introspection made possible
2861
## by this class provides answers to the following three questions: (1) List of the
2862
## training samples that fall in the portion of the feature space that corresponds
2863
## to a node of the decision tree; (2) The probabilities associated with the last
2864
## feature test that led to the node; and (3) The class probabilities predicated on
2865
## just the last feature test on the path to that node.
2866
##
2867
## CAVEAT: It is possible for a node to exist even when there are no training
2868
## samples in the portion of the feature space that corresponds to the node. That
2869
## is because a decision tree is based on the probability densities estimated from
2870
## the training data. When training data is non-uniformly distributed, it is
2871
## possible for the probability associated with a point in the feature space to be
2872
## non-zero even when there are no training samples at or in the vicinity of that
2873
## point.
2874
##
2875
## For a node to exist even where there are no training samples in the portion of
2876
## the feature space that belongs to the node is an indication of the generalization
2877
## ability of decision-tree based classification.
2878
##
2879
## When used in a non-interactive mode, an instance of this class can be used to
2880
## create a tabular display that shows what training samples belong directly to the
2881
## portion of the feature space that corresponds to each node of the decision tree.
2882
## An instance of this class can also construct a tabular display that shows how the
2883
## influence of each training sample propagates in the decision tree. For each
2884
## training sample, this display first shows the list of nodes that came into
2885
## existence through feature test(s) that used the data provided by that sample.
2886
## This list for each training sample is followed by a subtree of the nodes that owe
2887
## their existence indirectly to the training sample. A training sample influences a
2888
## node indirectly if the node is a descendant of another node that is affected
2889
## directly by the training sample.
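##
## A minimal usage sketch, assuming $dt is an already constructed
## Algorithm::DecisionTree instance on which construct_decision_tree_classifier()
## has been called:
##
##     my $introspector = DTIntrospection->new($dt);
##     $introspector->initialize();
##
##     # non-interactive tabular displays:
##     $introspector->display_training_samples_at_all_nodes_direct_influence_only();
##     $introspector->display_training_samples_to_nodes_influence_propagation();
##
##     # interactive mode:
##     $introspector->explain_classifications_at_multiple_nodes_interactively();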
2890
2891
1
1
5
use strict;
1
0
1
20
2892
1
1
4
use Carp;
1
0
1
1847
2893
2894
sub new {
2895
0
0
my ($class, $dt) = @_;
2896
0
0
croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2897
unless ref($dt) eq "Algorithm::DecisionTree";
2898
bless {
2899
_dt => $dt,
2900
_root_dtnode => $dt->{_root_node},
2901
0
_samples_at_nodes_hash => {},
2902
_branch_features_to_nodes_hash => {},
2903
_sample_to_node_mapping_direct_hash => {},
2904
_node_serial_num_to_node_hash => {},
2905
_awareness_raising_msg_shown => 0,
2906
_debug => 0,
2907
}, $class;
2908
}
2909
2910
sub initialize {
2911
0
0
my $self = shift;
2912
croak "You must first construct the decision tree before using the DTIntrospection class"
2913
0
0
unless $self->{_root_dtnode};
2914
0
$self->recursive_descent($self->{_root_dtnode});
2915
}
2916
2917
sub recursive_descent {
2918
0
0
my $self = shift;
2919
0
my $node = shift;
2920
0
my $node_serial_number = $node->get_serial_num();
2921
0
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2922
0
0
print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
0
2923
0
$self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2924
0
$self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2925
0
my @samples_at_node = ();
2926
0
foreach my $item (@$branch_features_and_values_or_thresholds) {
2927
0
my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2928
0
0
unless (@samples_at_node) {
2929
0
@samples_at_node = @$samples_for_feature_value_combo;
2930
} else {
2931
0
my @accum;
2932
0
foreach my $sample (@samples_at_node) {
2933
0
0
push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2934
}
2935
0
@samples_at_node = @accum;
2936
}
2937
0
0
last unless @samples_at_node;
2938
}
2939
0
@samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
0
2940
0
0
print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2941
0
$self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2942
0
0
if (@samples_at_node) {
2943
0
foreach my $sample (@samples_at_node) {
2944
0
0
if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2945
0
$self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2946
} else {
2947
0
push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
0
2948
}
2949
}
2950
}
2951
0
my $children = $node->get_children();
2952
0
foreach my $child (@$children) {
2953
0
$self->recursive_descent($child);
2954
}
2955
}
2956
2957
sub display_training_samples_at_all_nodes_direct_influence_only {
2958
0
0
my $self = shift;
2959
croak "You must first construct the decision tree before using the DT Introspection class."
2960
0
0
unless $self->{_root_dtnode};
2961
0
$self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2962
}
2963
2964
sub recursive_descent_for_showing_samples_at_a_node{
2965
0
0
my $self = shift;
2966
0
my $node = shift;
2967
0
my $node_serial_number = $node->get_serial_num();
2968
0
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2969
0
0
if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2970
0
0
print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
0
2971
0
print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
0
2972
}
2973
0
map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
0
2974
}
2975
2976
sub display_training_samples_to_nodes_influence_propagation {
2977
0
0
my $self = shift;
2978
0
foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
0
0
2979
0
0
if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2980
0
my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2981
0
print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
0
2982
0
print " nodes affected through probabilistic generalization:\n";
2983
0
map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2984
}
2985
}
2986
}
2987
2988
sub recursive_descent_for_sample_to_node_influence {
2989
0
0
my $self = shift;
2990
0
my $node_serial_num = shift;
2991
0
my $nodes_already_accounted_for = shift;
2992
0
my $offset = shift;
2993
0
$offset .= " ";
2994
0
my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2995
0
my @children = map $_->get_serial_num(), @{$node->get_children()};
0
2996
0
my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
0
0
2997
0
0
if (@children_affected) {
2998
0
print "$offset $node_serial_num => [@children_affected]\n";
2999
}
3000
0
map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
3001
}
3002
3003
sub get_samples_for_feature_value_combo {
3004
0
0
my $self = shift;
3005
0
my $feature_value_combo = shift;
3006
0
my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3007
0
my @samples = ();
3008
0
0
if ($op eq '=') {
0
0
3009
0
@samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
0
0
3010
} elsif ($op eq '<') {
3011
0
foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
0
3012
0
my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
0
3013
0
foreach my $item (@features_and_values) {
3014
0
my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3015
0
0
0
if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
0
3016
0
push @samples, $sample;
3017
0
last;
3018
}
3019
}
3020
}
3021
} elsif ($op eq '>') {
3022
0
foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
0
3023
0
my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
0
3024
0
foreach my $item (@features_and_values) {
3025
0
my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3026
0
0
0
if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
0
3027
0
push @samples, $sample;
3028
0
last;
3029
}
3030
}
3031
}
3032
} else {
3033
0
die "Something strange is going on";
3034
}
3035
0
return \@samples;
3036
}
3037
3038
sub extract_feature_op_val {
3039
0
0
my $self = shift;
3040
0
my $feature_value_combo = shift;
3041
0
my $pattern1 = '(.+)=(.+)';
3042
0
my $pattern2 = '(.+)<(.+)';
3043
0
my $pattern3 = '(.+)>(.+)';
3044
0
my ($feature,$value,$op);
3045
0
0
if ($feature_value_combo =~ /$pattern2/) {
0
0
3046
0
($feature,$op,$value) = ($1,'<',$2);
3047
} elsif ($feature_value_combo =~ /$pattern3/) {
3048
0
($feature,$op,$value) = ($1,'>',$2);
3049
} elsif ($feature_value_combo =~ /$pattern1/) {
3050
0
($feature,$op,$value) = ($1,'=',$2);
3051
}
3052
0
return ($feature,$op,$value);
3053
}
3054
3055
sub explain_classifications_at_multiple_nodes_interactively {
3056
0
0
my $self = shift;
3057
croak "You called explain_classification_at_multiple_nodes_interactively() without " .
3058
"first initializing the DTIntrospection instance in your code. Aborting."
3059
0
0
unless $self->{_samples_at_nodes_hash};
3060
0
print "\n\nIn order for the decision tree to introspect\n\n";
3061
0
print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3062
" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3063
" with it even when there are NO training data points in that region?\n" .
3064
"\nEnter 'y' for yes or any other character for no: ";
3065
0
my $ans = <STDIN>;
3066
0
$ans =~ s/^\s*|\s*$//g;
3067
0
0
die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3068
0
$self->{_awareness_raising_msg_shown} = 1;
3069
0
while (1) {
3070
0
my $node_id;
3071
my $ans;
3072
0
while (1) {
3073
0
print "\nEnter the integer ID of a node: ";
3074
0
$ans = <STDIN>;
3075
0
$ans =~ s/^\s*|\s*$//g;
3076
0
0
return if $ans =~ /^exit$/;
3077
0
0
last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
0
3078
0
print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3079
}
3080
0
$node_id = $ans;
3081
0
$self->explain_classification_at_one_node($node_id)
3082
}
3083
}
3084
3085
sub explain_classification_at_one_node {
3086
0
0
my $self = shift;
3087
0
my $node_id = shift;
3088
croak "You called explain_classification_at_one_node() without first initializing " .
3089
0
0
"the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3090
0
0
unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3091
0
print "Node $node_id is not a node in the tree\n";
3092
0
return;
3093
}
3094
0
0
unless ($self->{_awareness_raising_msg_shown}) {
3095
0
print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3096
0
print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3097
" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3098
" with it even when there are NO training data points in that region?\n" .
3099
"\nEnter 'y' for yes or any other character for no: ";
3100
0
my $ans = <STDIN>;
3101
0
$ans =~ s/^\s*|\s*$//g;
3102
0
0
die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3103
}
3104
0
my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
0
3105
0
my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
0
3106
# my @class_names = @{DTNode->get_class_names()};
3107
0
my @class_names = @{$self->get_class_names()};
0
3108
0
my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3109
0
my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3110
0
0
my $msg = @samples_at_node == 0
3111
? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3112
: "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3113
0
$msg .= "\n Features tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3114
0
$msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3115
0
$msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3116
0
print $msg;
3117
0
my $ans = <STDIN>;
3118
0
$ans =~ s/^\s*|\s*$//g;
3119
0
0
if ($ans =~ /^ye?s?$/) {
3120
0
my $sequence = [$branch_features_to_node[-1]];
3121
0
my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3122
0
print "\n probability of @{$sequence} is: $prob\n";
0
3123
}
3124
0
$msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3125
0
$msg .= "\n Enter 'y' for yes and any other character for no: ";
3126
0
print $msg;
3127
0
$ans = <STDIN>;
3128
0
$ans =~ s/^\s*|\s*$//g;
3129
0
0
if ($ans =~ /^ye?s?$/) {
3130
0
my $sequence = [$branch_features_to_node[-1]];
3131
0
foreach my $cls (@class_names) {
3132
0
my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3133
0
print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
0
3134
}
3135
} else {
3136
0
print "goodbye\n";
3137
}
3138
0
print "\n Finished supplying information on Node $node_id\n\n";
3139
}
3140
3141
1;
3142
3143
=pod
3144
3145
=head1 NAME
3146
3147
Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3148
multidimensional data.
3149
3150
3151
=head1 SYNOPSIS
3152
3153
# FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3154
3155
# In general, your call for constructing an instance of the DecisionTree class
3156
# will look like:
3157
3158
my $training_datafile = "stage3cancer.csv";
3159
my $dt = Algorithm::DecisionTree->new(
3160
training_datafile => $training_datafile,
3161
csv_class_column_index => 2,
3162
csv_columns_for_features => [3,4,5,6,7,8],
3163
entropy_threshold => 0.01,
3164
max_depth_desired => 8,
3165
symbolic_to_numeric_cardinality_threshold => 10,
3166
);
3167
3168
# The constructor option `csv_class_column_index' informs the module as to which
3169
# column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3170
# BASED. The constructor option `csv_columns_for_features' specifies which columns
3171
# are to be used for feature values. The first row of the CSV file must specify
3172
# the names of the features. See examples of CSV files in the `Examples'
3173
# subdirectory.
3174
3175
# The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3176
# the example shown above, if an ostensibly numeric feature takes on only 10 or
3177
# fewer different values in your training datafile, it will be treated like a
3178
# symbolic feature. The option `entropy_threshold' determines the granularity
3179
# with which the entropies are sampled for the purpose of calculating entropy gain
3180
# with a particular choice of decision threshold for a numeric feature or a feature
3181
# value for a symbolic feature.
3182
3183
# After you have constructed an instance of the DecisionTree class as shown above,
3184
# you read in the training data file and initialize the probability cache by
3185
# calling:
3186
3187
$dt->get_training_data();
3188
$dt->calculate_first_order_probabilities();
3189
$dt->calculate_class_priors();
3190
3191
# Next you construct a decision tree for your training data by calling:
3192
3193
$root_node = $dt->construct_decision_tree_classifier();
3194
3195
# where $root_node is an instance of the DTNode class that is also defined in the
3196
# module file. Now you are ready to classify a new data record. Let's say that
3197
# your data record looks like:
3198
3199
my @test_sample = qw / g2=4.2
3200
grade=2.3
3201
gleason=4
3202
eet=1.7
3203
age=55.0
3204
ploidy=diploid /;
3205
3206
# You can classify it by calling:
3207
3208
my $classification = $dt->classify($root_node, \@test_sample);
3209
3210
# The call to `classify()' returns a reference to a hash whose keys are the class
3211
# names and whose values are the associated classification probabilities. This hash also
3212
# includes another key-value pair for the solution path from the root node to the
3213
# leaf node at which the final classification was carried out.
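# For example, you could display the per-class probabilities returned by
# `classify()' with something like the following sketch. (It assumes that the
# solution-path entry in the returned hash is stored under a key named
# 'solution_path'; that key is skipped when printing the class probabilities.)

foreach my $class_name (sort keys %$classification) {
next if $class_name eq 'solution_path'; # assumed key name for the solution path
print "$class_name => $classification->{$class_name}\n";
}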
3214
3215
3216
=head1 CHANGES
3217
3218
B All the changes made in this version relate to the construction of
3219
regression trees. I have fixed a couple of bugs in the calculation of the regression
3220
coefficients. Additionally, the C class now comes with a new
3221
constructor parameter named C. For most cases, you'd set this
3222
parameter to 0, which causes the regression coefficients to be estimated through
3223
linear least-squares minimization.
3224
3225
B In addition to constructing decision trees, this version of the
3226
module also allows you to construct regression trees. The regression tree capability
3227
has been packed into a separate subclass, named C, of the main
3228
C class. The subdirectory C in the main
3229
installation directory illustrates how you can use this new functionality of the
3230
module.
3231
3232
B This version incorporates four very significant upgrades/changes to
3233
the C module: B<(1)> The CSV cleanup is now the default. So you do not
3234
have to set any special parameters in the constructor calls to initiate CSV
3235
cleanup. B<(2)> In the form of a new Perl class named C,
3236
this module provides you with an easy-to-use programming interface for attempting
3237
needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3238
dominated by a single class. You need to set the constructor parameter
3239
C to invoke the logic that constructs multiple
3240
decision trees, each using the minority class samples along with samples drawn
3241
randomly from the majority class. The final classification is made through a
3242
majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3243
big-data problem --- in the sense that you have been given a training database with a
3244
very large number of training records --- the class C will
3245
also let you construct multiple decision trees by pulling training data randomly from
3246
your training database (without paying attention to the relative populations of the
3247
classes). The final classification decision for a test sample is based on a majority
3248
vote from all the decision trees thus constructed. See the C
3249
directory for how to use these new features of the module. And, finally, B<(4)>
3250
Support for the old-style '.dat' training files has been dropped in this version.
3251
3252
B This version makes it easier to use a CSV training file that
3253
violates the assumption that a comma be used only to separate the different field
3254
values in a line record. Some large econometrics databases use double-quoted values
3255
for fields, and these values may contain commas (presumably for better readability).
3256
This version also allows you to specify the leftmost entry in the first CSV record
3257
that names all the fields. Previously, this entry was required to be an empty
3258
double-quoted string. I have also made some minor changes to the
3259
'C' method to make it more user friendly for large
3260
training files that may contain tens of thousands of records. When pulling training
3261
data from such files, this method prints out a dot on the terminal screen for every
3262
10000 records it has processed.
3263
3264
B This version brings the boosting capability to the C
3265
module.
3266
3267
B This version adds bagging to the C module. If your
3268
training dataset is large enough, you can ask the module to construct multiple
3269
decision trees using data bags extracted from your dataset. The module can show you
3270
the results returned by the individual decision trees and also the results obtained
3271
by taking a majority vote of the classification decisions made by the individual
3272
trees. You can specify any arbitrary extent of overlap between the data bags.
3273
3274
B The introspection capability in this version packs more of a punch.
3275
For each training data sample, you can now figure out not only the decision-tree
3276
nodes that are affected directly by that sample, but also those nodes that are
3277
affected indirectly through the generalization achieved by the probabilistic modeling
3278
of the data. The 'examples' directory of this version includes additional scripts
3279
that illustrate these enhancements to the introspection capability. See the section
3280
"The Introspection API" for a declaration of the introspection related methods, old
3281
and new.
3282
3283
B In response to requests from several users, this version includes a new
3284
capability: You can now ask the module to introspect about the classification
3285
decisions returned by the decision tree. Toward that end, the module includes a new
3286
class named C. Perhaps the most important bit of information you
3287
are likely to seek through DT introspection is the list of the training samples that
3288
fall directly in the portion of the feature space that is assigned to a node.
3289
B When training samples are non-uniformly distributed in the underlying
3290
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3291
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3292
part of the generalization achieved by probabilistic modeling of the training data.)>
3293
For additional information related to DT introspection, see the section titled
3294
"DECISION TREE INTROSPECTION" in this documentation page.
3295
3296
B makes the logic of tree construction from the old-style '.dat' training
3297
files more consistent with how trees are constructed from the data in `.csv' files.
3298
The inconsistency in the past was with respect to the naming convention for the class
3299
labels associated with the different data records.
3300
3301
B fixes a bug in the part of the module that some folks use for generating
3302
synthetic data for experimenting with decision tree construction and classification.
3303
In the class C that is a part of the module, there
3304
was a problem with the order in which the features were recorded from the
3305
user-supplied parameter file. The basic code for decision tree construction and
3306
classification remains unchanged.
3307
3308
B further downshifts the required version of Perl for this module. This
3309
was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3310
in the module code needed to be changed for the module to work with the older version
3311
of Perl.
3312
3313
B fixes the C restriction on the required Perl version. This
3314
version should work with Perl versions 5.14.0 and higher.
3315
3316
B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3317
else remains the same.
3318
3319
B should prove more robust when the probability distribution for the
3320
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3321
observations can occur with significant probabilities. A new option in the
3322
DecisionTree constructor lets the user specify the precision with which the
3323
probability distributions are estimated for such features.
3324
3325
B fixes a bug that was caused by the explicitly set zero values for
3326
numerical features being misconstrued as "false" in the conditional statements in
3327
some of the method definitions.
3328
3329
B makes it easier to write code for classifying in one go all of your test
3330
data samples in a CSV file. The bulk classifications obtained can be written out to
3331
either a CSV file or to a regular text file. See the script
3332
C in the C directory for how to
3333
classify all of your test data records in a CSV file. This version also includes
3334
improved code for generating synthetic numeric/symbolic training and test data
3335
records for experimenting with the decision tree classifier.
3336
3337
B allows you to test the quality of your training data by running a 10-fold
3338
cross-validation test on the data. This test divides all of the training data into
3339
ten parts, with nine parts used for training a decision tree and one part used for
3340
testing its ability to classify correctly. This selection of nine parts for training
3341
and one part for testing is carried out in all of the ten different ways that are
3342
possible. This testing functionality in Version 2.1 can also be used to find the
3343
best values to use for the constructor parameters C,
3344
C, and C.
3345
3346
B Now you can use both numeric and
3347
symbolic features for constructing a decision tree. A feature is numeric if it can
3348
take any floating-point value over an interval.
3349
3350
B fixes a bug in the code that was triggered by 0 being declared as one of
3351
the feature values in the training datafile. Version 1.71 also includes an additional
3352
safety feature that is useful for training datafiles that contain a very large number
3353
of features. The new version makes sure that the number of values you declare for
3354
each sample record matches the number of features declared at the beginning of the
3355
training datafile.
3356
3357
B includes safety checks on the consistency of the data you place in your
3358
training datafile. When a training file contains thousands of samples, it is
3359
difficult to manually check that you used the same class names in your sample records
3360
that you declared at the top of your training file or that the values you have for your
3361
features are legal vis-a-vis the earlier declarations of the values in the training
3362
file. Another safety feature incorporated in this version is the non-consideration
3363
of classes that are declared at the top of the training file but that have no sample
3364
records in the file.
3365
3366
B uses probability caching much more extensively compared to the previous
3367
versions. This should result in faster construction of large decision trees.
3368
Another new feature in Version 1.6 is the use of a decision tree for interactive
3369
classification. In this mode, after you have constructed a decision tree from the
3370
training data, the user is prompted for answers to the questions pertaining to the
3371
feature tests at the nodes of the tree.
3372
3373
Some key elements of the documentation were cleaned up and made more readable in
3374
B. The implementation code remains unchanged from Version 1.4.
3375
3376
B should make things faster (and easier) for folks who want to use this
3377
module with training data that creates very large decision trees (that is, trees with
3378
tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3379
achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3380
addition, this version provides an additional constructor parameter,
3381
C for controlling the size of the decision tree. This is in
3382
addition to the tree size control achieved by the parameter C that
3383
was introduced in Version 1.3. Since large decision trees can take a long time to
3384
create, you may find yourself wishing you could store the tree you just created in a
3385
disk file and that, subsequently, you could use the stored tree for classification
3386
work. The C directory contains two scripts, C and
3387
C, that show how you can do exactly that with the
3388
help of Perl's C module.
3389
3390
B addresses the issue that arises when the header of a training datafile
3391
declares a certain possible value for a feature but that (feature,value) pair does
3392
NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3393
user to control the size of the decision tree by changing the value of the parameter
3394
C. Additionally, Version 1.3 includes a method called
3395
C that displays useful information regarding the size and
3396
some other attributes of the training data. It also warns the user if the training
3397
data might result in a decision tree that would simply be much too large --- unless
3398
the user controls the size with the entropy_threshold parameter.
3399
3400
In addition to the removal of a couple of serious bugs, B incorporates a
3401
number of enhancements: (1) Version 1.2 includes checks on the names of the features
3402
and values used in test data --- this is the data you want to classify with the
3403
decision tree classifier constructed by this module. (2) Version 1.2 includes a
3404
separate constructor for generating test data. To make it easier to generate test
3405
data whose probabilistic parameters may not be identical to that used for the
3406
training data, I have used separate routines for generating the test data. (3)
3407
Version 1.2 also includes in its examples directory a script that classifies the test
3408
data in a file and outputs the class labels into another file. This is for folks who
3409
do not wish to write their own scripts using this module. (4) Version 1.2 also
3410
includes addition to the documentation regarding the issue of numeric values for
3411
features.
3412
3413
=head1 DESCRIPTION
3414
3415
B is a I module for constructing a decision tree from
3416
a training datafile containing multidimensional data. In one form or another,
3417
decision trees have been around for about fifty years. From a statistical
3418
perspective, they are closely related to classification and regression by recursive
3419
partitioning of multidimensional data. Early work that demonstrated the usefulness
3420
of such partitioning of data for classification and regression can be traced to the
3421
work of Terry Therneau in the early 1980's in the statistics community, and to the
3422
work of Ross Quinlan in the mid 1990's in the machine learning community.
3423
3424
For those not familiar with decision tree ideas, the traditional way to classify
3425
multidimensional data is to start with a feature space whose dimensionality is the
3426
same as that of the data. Each feature in this space corresponds to the attribute
3427
that each dimension of the data measures. You then use the training data to carve up
3428
the feature space into different regions, each corresponding to a different class.
3429
Subsequently, when you try to classify a new data sample, you locate it in the
3430
feature space and find the class label of the region to which it belongs. One can
3431
also give the new data point the same class label as that of the nearest training
3432
sample. This is referred to as the nearest neighbor classification. There exist
3433
hundreds of variations of varying power on these two basic approaches to the
3434
classification of multidimensional data.
3435
3436
A decision tree classifier works differently. When you construct a decision tree,
3437
you select for the root node a feature test that partitions the training data in a
3438
way that causes maximal disambiguation of the class labels associated with the data.
3439
In terms of information content as measured by entropy, such a feature test would
3440
cause maximum reduction in class entropy in going from all of the training data taken
3441
together to the data as partitioned by the feature test. You then drop from the root
3442
node a set of child nodes, one for each partition of the training data created by the
3443
feature test at the root node. When your features are purely symbolic, you'll have
3444
one child node for each value of the feature chosen for the feature test at the root.
3445
When the test at the root involves a numeric feature, you find the decision threshold
3446
for the feature that best bipartitions the data and you drop from the root node two
3447
child nodes, one for each partition. Now at each child node you pose the same
3448
question that you posed when you found the best feature to use at the root: Which
3449
feature at the child node in question would maximally disambiguate the class labels
3450
associated with the training data corresponding to that child node?
3451
3452
As the reader would expect, the two key steps in any approach to decision-tree based
3453
classification are the construction of the decision tree itself from a file
3454
containing the training data, and then using the decision tree thus obtained for
3455
classifying new data.
3456
3457
What is cool about decision tree classification is that it gives you soft
3458
classification, meaning it may associate more than one class label with a given data
3459
vector. When this happens, it may mean that your classes are indeed overlapping in
3460
the underlying feature space. It could also mean that you simply have not supplied
3461
sufficient training data to the decision tree classifier. For a tutorial
3462
introduction to how a decision tree is constructed and used, visit
3463
L
3464
3465
This module also allows you to generate your own synthetic training and test
3466
data. Generating your own training data, using it for constructing a decision-tree
3467
classifier, and subsequently testing the classifier on a synthetically generated
3468
test set of data is a good way to develop greater proficiency with decision trees.
3469
3470
3471
=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3472
3473
If you are new to the concept of a decision tree, their practical utility is best
3474
understood with an example that only involves symbolic features. However, as
3475
mentioned earlier, versions of the module higher than 2.0 allow you to use both
3476
symbolic and numeric features.
3477
3478
Consider the following scenario: Let's say you are running a small investment company
3479
that employs a team of stockbrokers who make buy/sell decisions for the customers of
3480
your company. Assume that your company has asked the traders to make each investment
3481
decision on the basis of the following four criteria:
3482
3483
price_to_earnings_ratio (P_to_E)
3484
3485
price_to_sales_ratio (P_to_S)
3486
3487
return_on_equity (R_on_E)
3488
3489
market_share (MS)
3490
3491
Since you are the boss, you keep track of the buy/sell decisions made by the
3492
individual traders. But one unfortunate day, all of your traders decide to quit
3493
because you did not pay them enough. So what do you do? If you had a module like
3494
the one here, you could still run your company and do so in such a way that, on the
3495
average, would do better than any of the individual traders who worked for your
3496
company. This is what you do: You pool together the individual trader buy/sell
3497
decisions you have accumulated during the last one year. This pooled information is
3498
likely to look like:
3499
3500
3501
example buy/sell P_to_E P_to_S R_on_E MS
3502
==============================================================
3503
3504
example_1 buy high low medium low
3505
example_2 buy medium medium low low
3506
example_3 sell low medium low high
3507
....
3508
....
3509
3510
This data, when formatted according to CSV, would constitute your training file. You
3511
could feed this file into the module by calling:
3512
3513
my $dt = Algorithm::DecisionTree->new(
3514
training_datafile => $training_datafile,
3515
csv_class_column_index => 1,
3516
csv_columns_for_features => [2,3,4,5],
3517
);
3518
$dt->get_training_data();
3519
$dt->calculate_first_order_probabilities();
3520
$dt->calculate_class_priors();
3521
3522
Subsequently, you would construct a decision tree by calling:
3523
3524
my $root_node = $dt->construct_decision_tree_classifier();
3525
3526
Now you and your company (with practically no employees) are ready to service the
3527
customers again. Suppose your computer needs to make a buy/sell decision about an
3528
investment prospect that is best described by:
3529
3530
price_to_earnings_ratio = low
3531
price_to_sales_ratio = very_low
3532
return_on_equity = none
3533
market_share = medium
3534
3535
All that your computer would need to do would be to construct a data vector like
3536
3537
my @data = qw / P_to_E=low
3538
P_to_S=very_low
3539
R_on_E=none
3540
MS=medium /;
3541
3542
and call the decision tree classifier you just constructed by
3543
3544
$dt->classify($root_node, \@data);
3545
3546
The answer returned will be 'buy' and 'sell', along with the associated
3547
probabilities. So if the probability of 'buy' is considerably greater than the
3548
probability of 'sell', that's what you should instruct your computer to do.
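To see how you might read those probabilities out in code, here is a minimal sketch that simply continues from the $dt and $root_node already constructed above; nothing in it goes beyond the calls shown in this section, and the loop at the end just prints each class label with its probability:

    my @data = qw/ P_to_E=low  P_to_S=very_low  R_on_E=none  MS=medium /;
    my $classification = $dt->classify($root_node, \@data);

    # The returned hash reference maps class names to probabilities; it also
    # carries a special entry whose value is an array reference holding the
    # path through the tree, so anything that is not a plain number is skipped.
    foreach my $label (sort keys %$classification) {
        next if ref $classification->{$label};
        print "$label => $classification->{$label}\n";
    }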
3549
3550
The chances are that, on the average, this approach would beat the performance of any
3551
of your individual traders who worked for you previously since the buy/sell decisions
3552
made by the computer would be based on the collective wisdom of all your previous
3553
traders. B
3554
captured by the silly little example here. However, it does nicely convey the
3555
sense in which the current module could be used.>
3556
3557
=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3558
3559
A feature is symbolic when its values are compared using string comparison operators.
3560
By the same token, a feature is numeric when its values are compared using numeric
3561
comparison operators. Having said that, features that take only a small number of
3562
numeric values in the training data can be treated symbolically provided you are
3563
careful about handling their values in the test data. At the least, you have to set
3564
the test data value for such a feature to its closest value in the training data.
3565
The module does that automatically for you for those numeric features for which the
3566
number of different numeric values is less than a user-specified threshold. For those
3567
numeric features that the module is allowed to treat symbolically, this snapping of
3568
the values of the features in the test data to the small set of values in the training
3569
data is carried out automatically by the module. That is, after a user has told the
3570
module which numeric features to treat symbolically, the user need not worry about
3571
how the feature values appear in the test data.
3572
3573
The constructor parameter C lets you tell
3574
the module when to consider an otherwise numeric feature symbolically. Suppose you
3575
set this parameter to 10; that means that all numeric-looking features that take 10
3576
or fewer different values in the training datafile will be considered to be symbolic
3577
features by the module. See the tutorial at
3578
L for
3579
further information on the implementation issues related to the symbolic and numeric
3580
features.
3581
3582
=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3583
3584
For the purpose of estimating the probabilities, it is necessary to sample the range
3585
of values taken on by a numerical feature. For features with "nice" statistical
3586
properties, this sampling interval is set to the median of the differences between
3587
the successive feature values in the training data. (Obviously, as you would expect,
3588
you first sort all the values for a feature before computing the successive
3589
differences.) This logic will not work for the sort of a feature described below.
3590
3591
Consider a feature whose values are heavy-tailed, and, at the same time, the values
3592
span a million to one range. What I mean by heavy-tailed is that rare values can
3593
occur with significant probabilities. It could happen that most of the values for
3594
such a feature are clustered at one of the two ends of the range. At the same time,
3595
there may exist a significant number of values near the end of the range that is less
3596
populated. (Typically, features related to human economic activities --- such as
3597
wealth, incomes, etc. --- are of this type.) With the logic described in the
3598
previous paragraph, you could end up with a sampling interval that is much too small,
3599
which could result in millions of sampling points for the feature if you are not
3600
careful.
3601
3602
Beginning with Version 2.22, you have two options in dealing with such features. You
3603
can choose to go with the default behavior of the module, which is to sample the
3604
value range for such a feature over a maximum of 500 points. Or, you can supply an
3605
additional option to the constructor that sets a user-defined value for the number of
3606
points to use. The name of the option is C. The following
3607
script
3608
3609
construct_dt_for_heavytailed.pl
3610
3611
in the C directory shows an example of how to call the constructor of the
3612
module with the C option.
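For reference, a hedged sketch of such a constructor call is shown below; the file name, the column indices, and the value 100 chosen for the number of sampling points are all purely illustrative:

    # Illustrative only: cap the sampling of a heavy-tailed numeric feature
    # at 100 points instead of relying on the default behavior.
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile        => "heavytailed_training.csv",
                 csv_class_column_index   => 1,
                 csv_columns_for_features => [2,3],
                 number_of_histogram_bins => 100,
             );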
3613
3614
3615
=head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3616
3617
Versions 2.1 and higher include a new class named C, derived from
3618
the main class C, that runs a 10-fold cross-validation test on your
3619
training data to test its ability to discriminate between the classes mentioned in
3620
the training file.
3621
3622
The 10-fold cross-validation test divides all of the training data into ten parts,
3623
with nine parts used for training a decision tree and one part used for testing its
3624
ability to classify correctly. This selection of nine parts for training and one part
3625
for testing is carried out in all of the ten different possible ways.
3626
3627
The following code fragment illustrates how you invoke the testing function of the
3628
EvalTrainingData class:
3629
3630
my $training_datafile = "training.csv";
3631
my $eval_data = EvalTrainingData->new(
3632
training_datafile => $training_datafile,
3633
csv_class_column_index => 1,
3634
csv_columns_for_features => [2,3],
3635
entropy_threshold => 0.01,
3636
max_depth_desired => 3,
3637
symbolic_to_numeric_cardinality_threshold => 10,
3638
);
3639
$eval_data->get_training_data();
3640
$eval_data->evaluate_training_data()
3641
3642
The last statement above prints out a Confusion Matrix and the value of Training Data
3643
Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3644
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3645
cross-validation test.
3646
3647
This testing functionality can also be used to find the best values to use for the
3648
constructor parameters C, C, and
3649
C.
3650
3651
The following two scripts in the C directory illustrate the use of the
3652
C class for testing the quality of your data:
3653
3654
evaluate_training_data1.pl
3655
evaluate_training_data2.pl
3656
3657
3658
=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3659
3660
Assuming your training data is good, the quality of the results you get from a
3661
decision tree would depend on the choices you make for the constructor parameters
3662
C, C, and
3663
C. You can optimize your choices for
3664
these parameters by running the 10-fold cross-validation test that is made available
3665
in Versions 2.2 and higher through the new class C that is included
3666
in the module file. A description of how to run this test is in the previous section
3667
of this document.
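As a simple illustration of this idea, the following sketch (it is not one of the module's own example scripts) repeats the evaluation shown in the previous section for a few candidate values of max_depth_desired; the file name and column indices are illustrative, and you would compare the Confusion Matrix and the Training Data Quality Index printed for each run:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;   # the EvalTrainingData class is included in the module file

    my $training_datafile = "training.csv";    # your own CSV training file
    foreach my $depth (2, 3, 4, 5) {
        print "\n==== evaluating with max_depth_desired = $depth ====\n";
        my $eval_data = EvalTrainingData->new(
                            training_datafile        => $training_datafile,
                            csv_class_column_index   => 1,
                            csv_columns_for_features => [2,3],
                            entropy_threshold        => 0.01,
                            max_depth_desired        => $depth,
                            symbolic_to_numeric_cardinality_threshold => 10,
                        );
        $eval_data->get_training_data();
        $eval_data->evaluate_training_data();
    }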
3668
3669
3670
=head1 DECISION TREE INTROSPECTION
3671
3672
Starting with Version 2.30, you can ask the C class of the module to
3673
explain the classification decisions made at the different nodes of the decision
3674
tree.
3675
3676
Perhaps the most important bit of information you are likely to seek through DT
3677
introspection is the list of the training samples that fall directly in the portion
3678
of the feature space that is assigned to a node.
3679
3680
However, note that, when training samples are non-uniformly distributed in the
3681
underlying feature space, it is possible for a node to exist even when there are no
3682
training samples in the portion of the feature space assigned to the node. That is
3683
because the decision tree is constructed from the probability densities estimated
3684
from the training data. When the training samples are non-uniformly distributed, it
3685
is entirely possible for the estimated probability densities to be non-zero in a
3686
small region around a point even when there are no training samples specifically in
3687
that region. (After you have created a statistical model for, say, the height
3688
distribution of people in a community, the model may return a non-zero probability
3689
for the height values in a small interval even if the community does not include a
3690
single individual whose height falls in that interval.)
3691
3692
That a decision-tree node can exist even when there are no training samples in that
3693
portion of the feature space that belongs to the node is an important indication of
3694
the generalization ability of a decision-tree-based classifier.
3695
3696
In light of the explanation provided above, before the DTIntrospection class supplies
3697
any answers at all, it asks you to accept the fact that features can take on non-zero
3698
probabilities at a point in the feature space even though there are zero training
3699
samples at that point (or in a small region around that point). If you do not accept
3700
this rudimentary fact, the introspection class will not yield any answers (since you
3701
are not going to believe the answers anyway).
3702
3703
The point made above implies that the path leading to a node in the decision tree may
3704
test a feature for a certain value or threshold despite the fact that the portion of
3705
the feature space assigned to that node is devoid of any training data.
3706
3707
See the following three scripts in the Examples directory for how to carry out DT
3708
introspection:
3709
3710
introspection_in_a_loop_interactive.pl
3711
3712
introspection_show_training_samples_at_all_nodes_direct_influence.pl
3713
3714
introspection_show_training_samples_to_nodes_influence_propagation.pl
3715
3716
The first script places you in an interactive session in which you will first be
3717
asked for the node number you are interested in. Subsequently, you will be asked
3718
whether or not you are interested in specific questions that the introspection can
3719
provide answers for. The second script descends down the decision tree and shows for
3720
each node the training samples that fall directly in the portion of the feature space
3721
assigned to that node. The third script shows for each training sample how it
3722
affects the decision-tree nodes either directly or indirectly through the
3723
generalization achieved by the probabilistic modeling of the data.
3724
3725
The output of the script
3726
C looks like:
3727
3728
Node 0: the samples are: None
3729
Node 1: the samples are: [sample_46 sample_58]
3730
Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3731
Node 3: the samples are: []
3732
Node 4: the samples are: []
3733
...
3734
...
3735
3736
The nodes for which no samples are listed come into existence through
3737
the generalization achieved by the probabilistic modeling of the data.
3738
3739
The output produced by the script
3740
C looks like
3741
3742
sample_1:
3743
nodes affected directly: [2 5 19 23]
3744
nodes affected through probabilistic generalization:
3745
2=> [3 4 25]
3746
25=> [26]
3747
5=> [6]
3748
6=> [7 13]
3749
7=> [8 11]
3750
8=> [9 10]
3751
11=> [12]
3752
13=> [14 18]
3753
14=> [15 16]
3754
16=> [17]
3755
19=> [20]
3756
20=> [21 22]
3757
23=> [24]
3758
3759
sample_4:
3760
nodes affected directly: [2 5 6 7 11]
3761
nodes affected through probabilistic generalization:
3762
2=> [3 4 25]
3763
25=> [26]
3764
5=> [19]
3765
19=> [20 23]
3766
20=> [21 22]
3767
23=> [24]
3768
6=> [13]
3769
13=> [14 18]
3770
14=> [15 16]
3771
16=> [17]
3772
7=> [8]
3773
8=> [9 10]
3774
11=> [12]
3775
3776
...
3777
...
3778
...
3779
3780
For each training sample, the display shown above first presents the list of nodes
3781
that are directly affected by the sample. A node is affected directly by a sample if
3782
the latter falls in the portion of the feature space that belongs to the former.
3783
Subsequently, for each training sample, the display shows a subtree of the nodes that
3784
are affected indirectly by the sample through the generalization achieved by the
3785
probabilistic modeling of the data. In general, a node is affected indirectly by a
3786
sample if it is a descendant of another node that is affected directly.
3787
3788
Also see the section titled B regarding how to invoke the
3789
introspection capabilities of the module in your own code.
3790
3791
=head1 METHODS
3792
3793
The module provides the following methods for constructing a decision tree from
3794
training data in a disk file and for classifying new data records with the decision
3795
tree thus constructed:
3796
3797
=over 4
3798
3799
=item B
3800
3801
my $dt = Algorithm::DecisionTree->new(
3802
training_datafile => $training_datafile,
3803
csv_class_column_index => 2,
3804
csv_columns_for_features => [3,4,5,6,7,8],
3805
entropy_threshold => 0.01,
3806
max_depth_desired => 8,
3807
symbolic_to_numeric_cardinality_threshold => 10,
3808
);
3809
3810
A call to C constructs a new instance of the C class.
3811
For this call to make sense, the training data in the training datafile must be
3812
in the CSV format.
3813
3814
=back
3815
3816
=head2 Constructor Parameters
3817
3818
=over 8
3819
3820
=item C:
3821
3822
This parameter supplies the name of the file that contains the training data.
3823
3824
=item C:
3825
3826
When using a CSV file for your training data, this parameter supplies the zero-based
3827
column index for the column that contains the class label for each data record in the
3828
training file.
3829
3830
=item C:
3831
3832
When using a CSV file for your training data, this parameter supplies a list of
3833
columns corresponding to the features you wish to use for decision tree construction.
3834
Each column is specified by its zero-based index.
3835
3836
=item C:
3837
3838
This parameter sets the granularity with which the entropies are sampled by the
3839
module. For example, a feature test at a node in the decision tree is acceptable if
3840
the entropy gain achieved by the test exceeds this threshold. The larger the value
3841
you choose for this parameter, the smaller the tree. Its default value is 0.01.
3842
3843
=item C:
3844
3845
This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3846
smaller the value you choose for this parameter, the smaller the tree.
3847
3848
=item C:
3849
3850
This parameter allows the module to treat an otherwise numeric feature symbolically
3851
if the number of different values the feature takes in the training data file does
3852
not exceed the value of this parameter.
3853
3854
=item C:
3855
3856
This parameter gives the user the option to set the number of points at which the
3857
value range for a feature should be sampled for estimating the probabilities. This
3858
parameter is effective only for those features that occupy a large value range and
3859
whose probability distributions are heavy tailed. B
3860
when you have a very large training dataset:> In general, the larger the dataset, the
3861
smaller the smallest difference between any two values for a numeric feature in
3862
relation to the overall range of values for that feature. In such cases, the module
3863
may use too large a number of bins for estimating the probabilities and that may slow
3864
down the calculation of the decision tree. You can get around this difficulty by
3865
explicitly giving a value to the 'C' parameter.
3866
3867
=back
3868
3869
3870
You can choose the best values to use for the last three constructor parameters by
3871
running a 10-fold cross-validation test on your training data through the class
3872
C that comes with Versions 2.1 and higher of this module. See the
3873
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
3874
3875
=over
3876
3877
=item B
3878
3879
After you have constructed a new instance of the C class,
3880
you must now read in the training data from the file named in the call to the
3881
constructor. This you do by:
3882
3883
$dt->get_training_data();
3884
3885
3886
=item B
3887
3888
If you wish to see the training data that was just digested by the module,
3889
call
3890
3891
$dt->show_training_data();
3892
3893
=item B
3894
3895
=item B
3896
3897
After the module has read the training data file, it needs to initialize the
3898
probability cache. This you do by invoking:
3899
3900
$dt->calculate_first_order_probabilities()
3901
$dt->calculate_class_priors()
3902
3903
=item B
3904
3905
With the probability cache initialized, it is time to construct a decision tree
3906
classifier. This you do by
3907
3908
my $root_node = $dt->construct_decision_tree_classifier();
3909
3910
This call returns an instance of type C. The C class is defined
3911
within the main package file. So don't forget that C<$root_node> in the above
3912
example call will be instantiated to an object of type C.
3913
3914
=item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3915
3916
$root_node->display_decision_tree(" ");
3917
3918
This will display the decision tree in your terminal window by using a recursively
3919
determined offset for each node as the display routine descends down the tree.
3920
3921
I have intentionally left the syntax fragment C<$root_node> in the above call to
3922
remind the reader that C is NOT called on the instance of
3923
the C we constructed earlier, but on the C instance returned by
3924
the call to C.
3925
3926
=item B
3927
3928
Let's say you want to classify the following data record:
3929
3930
my @test_sample = qw / g2=4.2
3931
grade=2.3
3932
gleason=4
3933
eet=1.7
3934
age=55.0
3935
ploidy=diploid /;
3936
3937
you'd make the following call:
3938
3939
my $classification = $dt->classify($root_node, \@test_sample);
3940
3941
where, again, C<$root_node> is an instance of type C returned by the call to
3942
C. The variable C<$classification> holds a
3943
reference to a hash whose keys are the class names and whose values are the associated
3944
probabilities. The hash that is returned by the above call also includes a special
3945
key-value pair for a key named C. The value associated with this key
3946
is an anonymous array that holds the path, in the form of a list of nodes, from the
3947
root node to the leaf node in the decision tree where the final classification was
3948
made.
3949
3950
3951
=item B
3952
3953
This method allows you to use a decision-tree based classifier in an interactive
3954
mode. In this mode, a user is prompted for answers to the questions pertaining to
3955
the feature tests at the nodes of the tree. The syntax for invoking this method is:
3956
3957
my $classification = $dt->classify_by_asking_questions($root_node);
3958
3959
where C<$dt> is an instance of the C class returned by a
3960
call to C and C<$root_node> the root node of the decision tree returned by a
3961
call to C.
3962
3963
=back
3964
3965
3966
=head1 THE INTROSPECTION API
3967
3968
To construct an instance of C, you call
3969
3970
my $introspector = DTIntrospection->new($dt);
3971
3972
where you supply the instance of the C class you used for constructing
3973
the decision tree through the parameter C<$dt>. After you have constructed an
3974
instance of the introspection class, you must initialize it by
3975
3976
$introspector->initialize();
3977
3978
Subsequently, you can invoke either of the following methods:
3979
3980
$introspector->explain_classification_at_one_node($node);
3981
3982
$introspector->explain_classifications_at_multiple_nodes_interactively();
3983
3984
depending on whether you want introspection at a single specified node or inside an
3985
infinite loop for an arbitrary number of nodes.
3986
3987
If you want to output a tabular display that shows for each node in the decision tree
3988
all the training samples that fall in the portion of the feature space that belongs
3989
to that node, call
3990
3991
$introspector->display_training_samples_at_all_nodes_direct_influence_only();
3992
3993
If you want to output a tabular display that shows for each training sample a list of
3994
all the nodes that are affected directly AND indirectly by that sample, call
3995
3996
$introspector->display_training_training_samples_to_nodes_influence_propagation();
3997
3998
A training sample affects a node directly if the sample falls in the portion of the
3999
feature space assigned to that node. On the other hand, a training sample is
4000
considered to affect a node indirectly if the node is a descendant of a node that is
4001
affected directly by the sample.
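Putting these calls together, a minimal sketch of an introspection session looks as follows. It assumes that $dt is the DecisionTree instance on which you have already called construct_decision_tree_classifier(), and the node number 1 passed to the first call is just an arbitrary example:

    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();

    # Explanation for one specific node (node 1 chosen arbitrarily):
    $introspector->explain_classification_at_one_node(1);

    # Tabular display of the training samples that fall directly in the
    # portion of the feature space assigned to each node:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();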
4002
4003
4004
=head1 BULK CLASSIFICATION OF DATA RECORDS
4005
4006
For large test datasets, you would obviously want to process an entire file of test
4007
data at a time. The following scripts in the C directory illustrate how you
4008
can do that:
4009
4010
classify_test_data_in_a_file.pl
4011
4012
This script requires three command-line arguments: the first names the
4013
training datafile, the second the test datafile, and the third the file in which the
4014
classification results are to be deposited.
4015
4016
The other examples directories, C, C, and
4017
C, also contain scripts that illustrate how to carry out
4018
bulk classification of data records when you wish to take advantage of bagging,
4019
boosting, or tree randomization. In their respective directories, these scripts are
4020
named:
4021
4022
bagging_for_bulk_classification.pl
4023
boosting_for_bulk_classification.pl
4024
classify_database_records.pl
4025
4026
4027
=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4028
4029
It depends on whether you apply the classifier at once to all the data samples in a
4030
file, or whether you feed one data sample at a time into the classifier.
4031
4032
In general, the classifier returns soft classification for a test data vector. What
4033
that means is that, in general, the classifier will list all the classes to which a
4034
given data vector could belong and the probability of each such class label for the
4035
data vector. Run the example scripts in the Examples directory to see how the output
4036
of classification can be displayed.
4037
4038
With regard to the soft classifications returned by this classifier, if the
4039
probability distributions for the different classes overlap in the underlying feature
4040
space, you would want the classifier to return all of the applicable class labels for
4041
a data vector along with the corresponding class probabilities. Another reason
4042
why the decision tree classifier may associate significant probabilities with
4043
multiple class labels is that you used an inadequate number of training samples to
4044
induce the decision tree. The good thing is that the classifier does not lie to you
4045
(unlike, say, a hard classification rule that would return a single class label
4046
corresponding to the partitioning of the underlying feature space). The decision
4047
tree classifier gives you the best classification that can be made given the training
4048
data you fed into it.
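If, after examining the soft classification, you still want a single hard label, nothing prevents you from simply picking the class with the largest probability in the hash reference returned by classify(). A small sketch, assuming $classification was obtained as shown in the METHODS section of this document:

    # Keep only the entries whose values are plain probabilities (the hash
    # also contains a special entry whose value holds the solution path):
    my @labels = grep { !ref $classification->{$_} } keys %$classification;

    # Sort by descending probability and report the top label:
    my ($best) = sort { $classification->{$b} <=> $classification->{$a} } @labels;
    print "most probable class: $best ($classification->{$best})\n";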
4049
4050
4051
=head1 USING BAGGING
4052
4053
Starting with Version 3.0, you can use the class C that
4054
comes with the module to incorporate bagging in your decision tree based
4055
classification. Bagging means constructing multiple decision trees for different
4056
(possibly overlapping) segments of the data extracted from your training dataset and
4057
then aggregating the decisions made by the individual decision trees for the final
4058
classification. The aggregation of the classification decisions can average out the
4059
noise and bias that may otherwise affect the classification decision obtained from
4060
just one tree.
4061
4062
=over 4
4063
4064
=item B
4065
4066
A typical call to the constructor for the C class looks
4067
like:
4068
4069
use Algorithm::DecisionTreeWithBagging;
4070
4071
my $training_datafile = "stage3cancer.csv";
4072
4073
my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4074
training_datafile => $training_datafile,
4075
csv_class_column_index => 2,
4076
csv_columns_for_features => [3,4,5,6,7,8],
4077
entropy_threshold => 0.01,
4078
max_depth_desired => 8,
4079
symbolic_to_numeric_cardinality_threshold => 10,
4080
how_many_bags => 4,
4081
bag_overlap_fraction => 0.2,
4082
);
4083
4084
Note in particular the following two constructor parameters:
4085
4086
how_many_bags
4087
4088
bag_overlap_fraction
4089
4090
where, as the name implies, the parameter C controls how many bags
4091
(and, therefore, how many decision trees) will be constructed from your training
4092
dataset; and where the parameter C controls the degree of
4093
overlap between the bags. To understand what exactly is achieved by setting the
4094
parameter C to 0.2 in the above example, let's say that the
4095
non-overlapping partitioning of the training data between the bags results in 100
4096
training samples per bag. With bag_overlap_fraction set to 0.2, an additional 20 samples
4097
drawn randomly from the other bags will be added to the data in each bag.
4098
4099
=back
4100
4101
=head2 B class>
4102
4103
=over 8
4104
4105
=item B
4106
4107
This method reads your training datafile, randomizes it, and then partitions it into
4108
the specified number of bags. Subsequently, if the constructor parameter
4109
C is non-zero, it adds to each bag additional samples drawn at
4110
random from the other bags. The number of these additional samples added to each bag
4111
is controlled by the constructor parameter C. If this
4112
parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4113
drawn from the other bags.
4114
4115
=item B
4116
4117
Shows for each bag the names of the training data samples in that bag.
4118
4119
=item B
4120
4121
Calls on the appropriate methods of the main C class to estimate the
4122
first-order probabilities from the data samples in each bag.
4123
4124
=item B
4125
4126
Calls on the appropriate method of the main C class to estimate the
4127
class priors for the data samples in each bag.
4128
4129
=item B
4130
4131
Calls on the appropriate method of the main C class to construct a
4132
decision tree from the training data in each bag.
4133
4134
=item B
4135
4136
Displays separately the decision tree for each bag.
4137
4138
=item B
4139
4140
Calls on the appropriate methods of the main C class to classify the
4141
argument test sample.
4142
4143
=item B
4144
4145
Displays separately the classification decision made by the decision tree
4146
constructed for each bag.
4147
4148
=item B
4149
4150
Using majority voting, this method aggregates the classification decisions made by
4151
the individual decision trees into a single decision.
4152
4153
=back
4154
4155
See the example scripts in the directory C for how to call these
4156
methods for classifying individual samples and for bulk classification when you place
4157
all your test samples in a single file.
4158
4159
=head1 USING BOOSTING
4160
4161
Starting with Version 3.20, you can use the class C for
4162
constructing a boosted decision-tree classifier. Boosting results in a cascade of
4163
decision trees in which each decision tree is constructed with samples that are
4164
mostly those that are misclassified by the previous decision tree. To be precise,
4165
you create a probability distribution over the training samples for the selection of
4166
samples for training each decision tree in the cascade. To start out, the
4167
distribution is uniform over all of the samples. Subsequently, this probability
4168
distribution changes according to the misclassifications by each tree in the cascade:
4169
if a sample is misclassified by a given tree in the cascade, the probability of its
4170
being selected for training the next tree is increased significantly. You also
4171
associate a trust factor with each decision tree depending on its power to classify
4172
correctly all of the training data samples. After a cascade of decision trees is
4173
constructed in this manner, you construct a final classifier that calculates the
4174
class label for a test data sample by taking into account the classification
4175
decisions made by each individual tree in the cascade, the decisions being weighted
4176
by the trust factors associated with the individual classifiers. These boosting
4177
notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4178
celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4179
Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4180
the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4181
tutorial introduction to AdaBoost, see L
4182
4183
Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4184
boosting apply only to the case of binary classification. Additionally, your
4185
training dataset must capture all of the significant statistical variations in the
4186
classes represented therein.
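For readers who want to see the bookkeeping behind the phrases "probability distribution over the training samples" and "trust factor", the standard AdaBoost formulas from the Freund-Schapire paper are reproduced below; this is background material, not code specific to this module. With e_t denoting the weighted misclassification rate of the stage-t tree h_t, and y_i the +1/-1 label of training sample x_i:

    \alpha_t   = \tfrac{1}{2} \ln \frac{1 - e_t}{e_t}                      % trust factor for stage t
    D_{t+1}(i) \propto D_t(i) \, e^{-\alpha_t \, y_i \, h_t(x_i)}          % updated sampling distribution
    H(x)       = \mathrm{sign}\Big( \sum_t \alpha_t \, h_t(x) \Big)        % final (binary) classifier

A misclassified sample (for which y_i h_t(x_i) = -1) thus has its selection probability increased for the next stage, and trees with lower error rates receive larger trust factors in the final vote.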
4187
4188
=over 4
4189
4190
=item B
4191
4192
If you'd like to experiment with boosting, a typical call to the constructor for the
4193
C class looks like:
4194
4195
use Algorithm::BoostedDecisionTree;
4196
my $training_datafile = "training6.csv";
4197
my $boosted = Algorithm::BoostedDecisionTree->new(
4198
training_datafile => $training_datafile,
4199
csv_class_column_index => 1,
4200
csv_columns_for_features => [2,3],
4201
entropy_threshold => 0.01,
4202
max_depth_desired => 8,
4203
symbolic_to_numeric_cardinality_threshold => 10,
4204
how_many_stages => 4,
4205
);
4206
4207
Note in particular the constructor parameter:
4208
4209
how_many_stages
4210
4211
As its name implies, this parameter controls how many stages will be used in the
4212
boosted decision tree classifier. As mentioned above, a separate decision tree is
4213
constructed for each stage of boosting using a set of training samples that are drawn
4214
through a probability distribution maintained over the entire training dataset.
4215
4216
=back
4217
4218
=head2 B class>
4219
4220
=over 8
4221
4222
=item B
4223
4224
This method reads your training datafile, creates the data structures from the data
4225
ingested for constructing the base decision tree.
4226
4227
=item B
4228
4229
Writes to the standard output the training data samples and also some relevant
4230
properties of the features used in the training dataset.
4231
4232
=item B
4233
4234
Calls on the appropriate methods of the main C class to estimate the
4235
first-order probabilities and the class priors.
4236
4237
=item B
4238
4239
Calls on the appropriate method of the main C class to construct the
4240
base decision tree.
4241
4242
=item B
4243
4244
Displays the base decision tree in your terminal window. (The textual form of the
4245
decision tree is written out to the standard output.)
4246
4247
=item B
4248
4249
Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4250
earlier, the training samples for each tree in the cascade are drawn using a
4251
probability distribution over the entire training dataset. This probability
4252
distribution for any given tree in the cascade is heavily influenced by which
4253
training samples are misclassified by the previous tree.
4254
4255
=item B
4256
4257
Displays separately in your terminal window the decision tree constructed for each
4258
stage of the cascade. (The textual form of the trees is written out to the standard
4259
output.)
4260
4261
=item B
4262
4263
Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4264
4265
=item B
4266
4267
You can call this method to display in your terminal window the classification
4268
decision made by each decision tree in the cascade. The method also prints out the
4269
trust factor associated with each decision tree. It is important to look
4270
simultaneously at the classification decision and the trust factor for each tree ---
4271
since a classification decision made by a specific tree may appear bizarre for a
4272
given test sample. This method is useful primarily for debugging purposes.
4273
4274
=item B
4275
4276
As with the previous method, this method is useful mostly for debugging. It returns
4277
class labels for the samples misclassified by the stage whose integer index is
4278
supplied as an argument to the method. Say you have 10 stages in your cascade. The
4279
value of the argument C would go from 0 to 9, with 0 corresponding to
4280
the base tree.
4281
4282
=item B
4283
4284
Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4285
classification decisions made by the individual trees while taking into account the
4286
trust factors associated with the trees. As mentioned earlier, we associate with
4287
each tree of the cascade a trust factor that depends on the overall misclassification
4288
rate associated with that tree.
4289
4290
=back
4291
4292
See the example scripts in the C subdirectory for how to call the
4293
methods listed above for classifying individual data samples with boosting and for
4294
bulk classification when you place all your test samples in a single file.
4295
4296
4297
=head1 USING RANDOMIZED DECISION TREES
4298
4299
As mentioned earlier, the new C class allows you to solve
4300
the following two problems: (1) Data classification using the needle-in-a-haystack
4301
metaphor, that is, when a vast majority of your training samples belong to just one
4302
class. And (2) You have access to a very large database of training samples and you
4303
wish to construct an ensemble of decision trees for classification.
4304
4305
=over 4
4306
4307
=item B
4308
4309
Here is how you'd call the C constructor for
4310
needle-in-a-haystack classification:
4311
4312
use Algorithm::RandomizedTreesForBigData;
4313
my $training_datafile = "your_database.csv";
4314
my $rt = Algorithm::RandomizedTreesForBigData->new(
4315
training_datafile => $training_datafile,
4316
csv_class_column_index => 48,
4317
csv_columns_for_features => [24,32,33,34,41],
4318
entropy_threshold => 0.01,
4319
max_depth_desired => 8,
4320
symbolic_to_numeric_cardinality_threshold => 10,
4321
how_many_trees => 5,
4322
looking_for_needles_in_haystack => 1,
4323
);
4324
4325
Note in particular the constructor parameters:
4326
4327
looking_for_needles_in_haystack
4328
how_many_trees
4329
4330
The first of these parameters, C, invokes the logic for
4331
constructing an ensemble of decision trees, each based on a training dataset that
4332
uses all of the minority class samples, and a random drawing from the majority class
4333
samples.
4334
4335
Here is how you'd call the C constructor for a more
4336
general attempt at constructing an ensemble of decision trees, with each tree trained
4337
with randomly drawn samples from a large database of training data (without paying
4338
attention to the differences in the sizes of the populations for the different
4339
classes):
4340
4341
use Algorithm::RandomizedTreesForBigData;
4342
my $training_datafile = "your_database.csv";
4343
my $rt = Algorithm::RandomizedTreesForBigData->new(
4344
training_datafile => $training_datafile,
4345
csv_class_column_index => 2,
4346
csv_columns_for_features => [3,4,5,6,7,8],
4347
entropy_threshold => 0.01,
4348
max_depth_desired => 8,
4349
symbolic_to_numeric_cardinality_threshold => 10,
4350
how_many_trees => 3,
4351
how_many_training_samples_per_tree => 50,
4352
);
4353
4354
Note in particular the constructor parameters:
4355
4356
how_many_training_samples_per_tree
4357
how_many_trees
4358
4359
When you set the C parameter, you are not allowed
4360
to also set the C parameter, and vice versa.
4361
4362
=back
4363
4364
=head2 B class>
4365
4366
=over 8
4367
4368
=item B
4369
4370
What this method does depends on which of the two constructor parameters ---
4371
C or C --- is
4372
set. When the former is set, it creates a collection of training datasets for
4373
C number of decision trees, with each dataset being a mixture of the
4374
minority class and samples drawn randomly from the majority class. However, when the
4375
latter option is set, all the datasets are drawn randomly from the training database
4376
with no particular attention given to the relative populations of the two classes.
4377
4378
=item B
4379
4380
As the name implies, this method shows the training data being used for all the
4381
decision trees. This method is useful for debugging purposes using small datasets.
4382
4383
=item B
4384
4385
Calls on the appropriate method of the main C to estimate the
4386
first-order probabilities for the training dataset to be used for each decision tree.
4387
4388
=item B
4389
4390
Calls on the appropriate method of the main C class to estimate the
4391
class priors for the training dataset to be used for each decision tree.
4392
4393
=item B
4394
4395
Calls on the appropriate method of the main C class to construct the
4396
decision trees.
4397
4398
=item B
4399
4400
Displays all the decision trees in your terminal window. (The textual form of the
4401
decision trees is written out to the standard output.)
4402
4403
=item B
4404
4405
The test_sample is sent to each decision tree for classification.
4406
4407
=item B
4408
4409
The classification decisions returned by the individual decision trees are written
4410
out to the standard output.
4411
4412
=item B
4413
4414
This method aggregates the classification results returned by the individual decision
4415
trees and returns the majority decision.
4416
4417
=back
4418
4419
=head1 CONSTRUCTING REGRESSION TREES:
4420
4421
Decision tree based modeling requires that the class labels be distinct. That is,
4422
the training dataset must contain a relatively small number of discrete class labels
4423
for all of your data records if you want to model the data with one or more decision
4424
trees. However, when one is trying to understand all of the associational
4425
relationships that exist in a large database, one often runs into situations where,
4426
instead of discrete class labels, you have a continuously valued variable as a
4427
dependent variable whose values are predicated on a set of feature values. It is for
4428
such situations that you will find useful the new class C that is now
4429
a part of the C module. The C class has been
4430
programmed as a subclass of the main C class.
4431
4432
You can think of regression with a regression tree as a powerful generalization of
4433
the very commonly used Linear Regression algorithms. Although you can certainly
4434
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4435
modeling nonlinearities between the predictor variables and the dependent variable,
4436
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4437
can inject continuities between the predictor and the predicted variables that may
4438
not really exist in the real data. Regression trees, on the other hand, give you a
4439
piecewise linear relationship between the predictor and the predicted variables that
4440
is freed from the constraints of superimposed continuities at the joins between the
4441
different segments. See the following tutorial for further information regarding the
4442
standard linear regression approach and the regression that can be achieved with the
4443
RegressionTree class in this module:
4444
L
4445
4446
The RegressionTree class in the current version of the module assumes that all of
4447
your data is numerical. That is, unlike what is possible with the DecisionTree class
4448
(and the other more closely related classes in this module) that allow your training
4449
file to contain a mixture of numerical and symbolic data, the RegressionTree class
4450
requires that ALL of your data be numerical. I hope to relax this constraint in
4451
future versions of this module. Obviously, the dependent variable will always be
4452
numerical for regression.
4453
4454
See the example scripts in the directory C if you wish to become
4455
more familiar with the regression capabilities of the module.
4456
4457
=over 4
4458
4459
=item B
4460
4461
my $training_datafile = "gendata5.csv";
4462
my $rt = Algorithm::RegressionTree->new(
4463
training_datafile => $training_datafile,
4464
dependent_variable_column => 2,
4465
predictor_columns => [1],
4466
mse_threshold => 0.01,
4467
max_depth_desired => 2,
4468
jacobian_choice => 0,
4469
);
4470
4471
Note in particular the constructor parameters:
4472
4473
dependent_variable
4474
predictor_columns
4475
mse_threshold
4476
jacobian_choice
4477
4478
The first of these parameters, C, is set to the column index in
4479
the CSV file for the dependent variable. The second constructor parameter,
4480
C, tells the system as to which columns contain values for the
4481
predictor variables. The third parameter, C, is for deciding when to
4482
partition the data at a node into two child nodes as a regression tree is being
4483
constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4484
partitioning any of the features at a node is smaller than C, that
4485
node becomes a leaf node of the regression tree.
4486
4487
The last parameter, C, must be set to either 0 or 1 or 2. Its
4488
default value is 0. When this parameter equals 0, the regression coefficients are
4489
calculated using the linear least-squares method and no further "refinement" of the
4490
coefficients is carried out using gradient descent. This is the fastest way to
4491
calculate the regression coefficients. When C is set to 1, you get
4492
a weak version of gradient descent in which the Jacobian is set to the "design
4493
matrix" itself. Choosing 2 for C results in a more reasonable
4494
approximation to the Jacobian. That, however, is at a cost of much longer
4495
computation time. B For most cases, using 0 for C is the
4496
best choice. See my tutorial "I" for why
4497
that is the case.
4498
4499
=back
4500
4501
=head2 B class>
4502
4503
=over 8
4504
4505
=item B
4506
4507
Only CSV training datafiles are allowed. Additionally, the first record in the file
4508
must list the names of the fields, and the first column must contain an integer ID
4509
for each record.
4510
4511
=item B
4512
4513
As the name implies, this is the method that constructs a regression tree.
4514
4515
=item B
4516
4517
Displays the regression tree, as the name implies. The white-space string argument
4518
specifies the offset to use in displaying the child nodes in relation to a parent
4519
node.
4520
4521
=item B
4522
4523
You call this method after you have constructed a regression tree if you want to
4524
calculate the prediction for one sample. The parameter C<$root_node> is what is
4525
returned by the call C. The formatting of the argument
4526
bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4527
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4528
this case, the C<$test_sample> parameter will be bound to a list that will look like
4529
4530
['xvar1 = 23.4', 'xvar2 = 12.9']
4531
4532
An arbitrary amount of white space, including none, on the two sides of the equality
4533
symbol is allowed in the construct shown above. A call to this method returns a
4534
dictionary with two key-value pairs. One of the keys is called C and
4535
the other C. The value associated with key C is the path
4536
in the regression tree to the leaf node that yielded the prediction. And the value
4537
associated with the key C is the answer you are looking for.
4538
4539
=item B
4540
4541
This call calculates the predictions for all of the predictor-variable data in your
4542
training file. The parameter C<$root_node> is what is returned by the call to
4543
C. The values for the dependent variable thus predicted
4544
can be seen by calling C, which is the method mentioned below.
4545
4546
=item B
4547
4548
This method displays the results obtained by calling the prediction method of the
4549
previous entry. This method also creates a hardcopy of the plots and saves it as a
4550
C<.png> disk file. The name of this output file is always C.
4551
4552
=item B
4553
4554
This method carries out an error analysis of the predictions for the samples in your
4555
training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4556
regression, the MSE for the data samples at each of the leaf nodes of the regression
4557
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4558
The parameter C<$root_node> in the call syntax is what is returned by the call to
4559
C.
4560
4561
=item B
4562
4563
Call this method if you want to apply the regression tree to all your test data in a
4564
disk file. The predictions for all of the test samples in the disk file are written
4565
out to another file whose name is the same as that of the test file except for the
4566
addition of C<_output> in the name of the file. The parameter C<$filename> is the
4567
name of the disk file that contains the test data. And the parameter C<$columns> is a
4568
list of the column indices for the predictor variables in the test file.
4569
4570
=back
4571
4572
=head1 GENERATING SYNTHETIC TRAINING DATA
4573
4574
The module file contains the following additional classes: (1)
4575
C, and (2) C for
4576
generating synthetic training data.
4577
4578
The class C outputs one CSV file for the
4579
training data and another one for the test data for experimenting with numeric
4580
features. The numeric values are generated using a multivariate Gaussian
4581
distribution whose mean and covariance are specified in a parameter file. See the
4582
file C in the C directory for an example of such a
4583
parameter file. Note that the dimensionality of the data is inferred from the
4584
information you place in the parameter file.
4585
4586
The class C generates synthetic training data for the
4587
purely symbolic case. The relative frequencies of the different possible values for
4588
the features are controlled by the biasing information you place in a parameter file.
4589
See C for an example of such a file.
4590
4591
4592
=head1 THE C DIRECTORY
4593
4594
See the C directory in the distribution for how to construct a decision
4595
tree, and how to then classify new data using the decision tree. To become more
4596
familiar with the module, run the scripts
4597
4598
construct_dt_and_classify_one_sample_case1.pl
4599
construct_dt_and_classify_one_sample_case2.pl
4600
construct_dt_and_classify_one_sample_case3.pl
4601
construct_dt_and_classify_one_sample_case4.pl
4602
4603
The first script is for the purely symbolic case, the second for the case that
4604
involves both numeric and symbolic features, the third for the case of purely numeric
4605
features, and the last for the case when the training data is synthetically generated
4606
by the script C.
4607
4608
Next run the following script as it is for bulk classification of data records placed
4609
in a CSV file:
4610
4611
classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4612
4613
The script first constructs a decision tree using the training data in the training
4614
file supplied by the first argument file C. The script then
4615
calculates the class label for each data record in the test data file supplied
4616
through the second argument file, C. The estimated class labels are
4617
written out to the output file which in the call shown above is C. An
4618
important thing to note here is that your test file --- in this case C ---
4619
must have a column for class labels. Obviously, in real-life situations, there will
4620
be no class labels in this column. When that is the case, you can place an empty
4621
string C<""> there for each data record. This is demonstrated by the following call:
4622
4623
classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4624
4625
The following script in the C directory
4626
4627
classify_by_asking_questions.pl
4628
4629
shows how you can use a decision-tree classifier interactively. In this mode, you
4630
first construct the decision tree from the training data and then the user is
4631
prompted for answers to the feature tests at the nodes of the tree.
4632
4633
If your training data has a feature whose values span a large range and, at the same
4634
time, are characterized by a heavy-tail distribution, you should look at the script
4635
4636
construct_dt_for_heavytailed.pl
4637
4638
to see how to use the option C in the call to the
4639
constructor. This option was introduced in Version 2.22 for dealing with such
4640
features. If you do not set this option, the module will use the default value of
4641
500 for the number of points at which to sample the value range for such a feature.
4642
4643
The C directory also contains the following scripts:
4644
4645
generate_training_data_numeric.pl
4646
generate_training_data_symbolic.pl
4647
4648
that show how you can use the module to generate synthetic training data. Synthetic
4649
training data is generated according to the specifications laid out in a parameter file.
4650
There are constraints on how the information is laid out in a parameter file. See
4651
the files C and C in the C directory
4652
for how to structure these files.
4653
4654
The C directory of Versions 2.1 and higher of the module also contains the
4655
following two scripts:
4656
4657
evaluate_training_data1.pl
4658
evaluate_training_data2.pl
4659
4660
that illustrate how the Perl class C can be used to evaluate the
4661
quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4662
class is a subclass of the C class in the module file. See the README
4663
in the C directory for further information regarding these two scripts.
4664
4665
The C directory of Versions 2.31 and higher of the module contains the
4666
following three scripts:
4667
4668
introspection_in_a_loop_interactive.pl
4669
4670
introspection_show_training_samples_at_all_nodes_direct_influence.pl
4671
4672
introspection_show_training_samples_to_nodes_influence_propagation.pl
4673
4674
The first script illustrates how to use the C class of the module
4675
interactively for generating explanations for the classification decisions made at
4676
the nodes of the decision tree. In the interactive session you are first asked for
4677
the node number you are interested in. Subsequently, you are asked whether or
4678
not you are interested in specific questions that the introspector can provide
4679
answers for. The second script generates a tabular display that shows for each node
4680
of the decision tree a list of the training samples that fall directly in the portion
4681
of the feature space assigned to that node. (As mentioned elsewhere in this
4682
documentation, when this list is empty for a node, that means the node is a result of
4683
the generalization achieved by probabilistic modeling of the data. Note that this
4684
module constructs a decision tree NOT by partitioning the set of training samples,
4685
BUT by partitioning the domains of the probability density functions.) The third
4686
script listed above also generates a tabular display, but one that shows how the
4687
influence of each training sample propagates in the tree. This display first shows
4688
the list of nodes that are affected directly by the data in a training sample. This
4689
list is followed by an indented display of the nodes that are affected indirectly by
4690
the training sample. A training sample affects a node indirectly if the node is a
4691
descendant of one of the nodes affected directly.
4692
4693
4694
=head1 THE C<ExamplesBagging> DIRECTORY

The C<ExamplesBagging> directory contains the following scripts:

    bagging_for_classifying_one_test_sample.pl

    bagging_for_bulk_classification.pl

As the names of the scripts imply, the first shows how to call the different methods
of the C<DecisionTreeWithBagging> class for classifying a single test sample. When
you are classifying a single test sample, you can also see how each bag is
classifying the test sample. You can, for example, display the training data used in
each bag, the decision tree constructed for each bag, etc.

The second script is for the case when you place all of the test samples in a single
file. The demonstration script displays for each test sample a single aggregate
classification decision that is obtained through majority voting by all the decision
trees.

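The sketch below indicates roughly what such a calling sequence looks like. The
package name, the bag-related constructor options, and the method names are
assumptions modeled on the two bagging scripts; consult those scripts for the
authoritative usage:

    use strict;
    use warnings;
    use Algorithm::DecisionTreeWithBagging;   # assumed package name

    my $bagged_dt = Algorithm::DecisionTreeWithBagging->new(
                        training_datafile        => 'my_training_data.csv',  # placeholder
                        csv_class_column_index   => 1,
                        csv_columns_for_features => [2, 3, 4, 5],
                        entropy_threshold        => 0.01,
                        max_depth_desired        => 8,
                        how_many_bags            => 4,      # assumed option name
                        bag_overlap_fraction     => 0.20,   # assumed option name
                    );
    $bagged_dt->get_training_data_for_bagging();            # assumed method names below
    $bagged_dt->calculate_first_order_probabilities();
    $bagged_dt->calculate_class_priors();
    $bagged_dt->construct_decision_trees_for_bags();

    my @test_sample = qw/ feature1=3.7 feature2=blue /;     # placeholder feature=value strings
    $bagged_dt->classify_with_bagging(\@test_sample);
    $bagged_dt->get_majority_vote_classification();
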
=head1 THE C<ExamplesBoosting> DIRECTORY

The C<ExamplesBoosting> subdirectory in the main installation directory contains the
following three scripts:

    boosting_for_classifying_one_test_sample_1.pl

    boosting_for_classifying_one_test_sample_2.pl

    boosting_for_bulk_classification.pl

As the names of the first two scripts imply, these show how to call the different
methods of the C<BoostedDecisionTree> class for classifying a single test sample.
When you are classifying a single test sample, you can see how each stage of the
cascade of decision trees is classifying the test sample. You can also view each
decision tree separately and also see the trust factor associated with the tree.

The third script is for the case when you place all of the test samples in a single
file. The demonstration script outputs for each test sample a single aggregate
classification decision that is obtained through trust-factor weighted majority
voting by all the decision trees.

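A rough sketch of the boosting workflow follows. The package name, the stage-count
option, and the method names are assumptions patterned on the three boosting
scripts, which are the authoritative reference:

    use strict;
    use warnings;
    use Algorithm::BoostedDecisionTree;    # assumed package name

    my $booster = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => 'my_training_data.csv',   # placeholder
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2, 3, 4, 5],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      how_many_stages          => 4,     # assumed option name
                  );
    $booster->get_training_data_for_base_tree();         # assumed method names below
    $booster->construct_base_decision_tree();
    $booster->construct_cascade_of_trees();

    my @test_sample = qw/ feature1=3.7 feature2=blue /;  # placeholder feature=value strings
    $booster->classify_with_boosting(\@test_sample);
    $booster->trust_weighted_majority_vote_classifier();
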
=head1 THE C<ExamplesRandomizedTrees> DIRECTORY

The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
become more familiar with the C<RandomizedTreesForBigData> class for solving
needle-in-a-haystack and big-data classification problems. These scripts are:

    randomized_trees_for_classifying_one_test_sample_1.pl

    randomized_trees_for_classifying_one_test_sample_2.pl

    classify_database_records.pl

The first script shows the constructor options to use for solving a
needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
training data belongs to just one class. The second script shows the constructor
options for using randomized decision trees for the case when you have access to a
very large database of training samples and you'd like to construct an ensemble of
decision trees using training samples pulled randomly from the training database.
The last script illustrates how you can evaluate the classification power of an
ensemble of decision trees as constructed by C<RandomizedTreesForBigData> by
classifying a large number of test samples extracted randomly from the training
database.

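Roughly, the calling sequence for the randomized-trees classifier looks like the
sketch below. The package name, the tree-related constructor options, and the method
names are assumptions modeled on the scripts listed above; treat those scripts as
the authoritative reference:

    use strict;
    use warnings;
    use Algorithm::RandomizedTreesForBigData;    # assumed package name

    my $rt_ensemble = Algorithm::RandomizedTreesForBigData->new(
                          training_datafile        => 'my_training_data.csv',  # placeholder
                          csv_class_column_index   => 1,
                          csv_columns_for_features => [2, 3, 4, 5],
                          entropy_threshold        => 0.01,
                          max_depth_desired        => 8,
                          how_many_trees           => 5,           # assumed option name
                          looking_for_needles_in_haystack => 1,    # assumed option name
                      );
    $rt_ensemble->get_training_data_for_N_trees();                 # assumed method names below
    $rt_ensemble->construct_all_decision_trees();

    my @test_sample = qw/ feature1=3.7 feature2=blue /;            # placeholder feature=value strings
    $rt_ensemble->classify_with_all_trees(\@test_sample);
    $rt_ensemble->display_classification_results_for_all_trees();
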
=head1 THE C<ExamplesRegression> DIRECTORY

The C<ExamplesRegression> subdirectory in the main installation directory shows
example scripts that you can use to become familiar with regression trees and how
they can be used for nonlinear regression. If you are new to the concept of
regression trees, start by executing the following scripts without changing them and
see what sort of output is produced by them:

    regression4.pl

    regression5.pl

    regression6.pl

    regression8.pl

The C<regression4.pl> script involves only one predictor variable and one dependent
variable. The training data for this exercise is drawn from a data file that
contains strongly nonlinear data. When you run the script C<regression4.pl>, you
will see how much better the result from tree regression is compared to what you can
get with linear regression.

The C<regression5.pl> script is essentially the same as the previous script except
for the fact that the training datafile used in this case consists of three noisy
segments, as opposed to just two in the previous case.

The script C<regression6.pl> deals with the case when we have two predictor
variables and one dependent variable. You can think of the data as consisting of
noisy height values over an C<(x1,x2)> plane. The data used in this script is drawn
from a CSV file.

Finally, the script C<regression8.pl> shows how you can carry out bulk prediction
for all your test data records in a disk file. The script writes all the calculated
predictions into another disk file whose name is derived from the name of the test
data file.

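A bare-bones sketch of regression-tree usage is shown below. The package name,
constructor options, and method names are assumptions based on the regression
scripts listed above; those scripts are the authoritative reference for the exact
calling sequence:

    use strict;
    use warnings;
    use Algorithm::RegressionTree;    # assumed package name

    # One predictor variable (column 1) and one dependent variable (column 2),
    # as in the single-predictor regression examples; the file name is a placeholder.
    my $rt = Algorithm::RegressionTree->new(
                 training_datafile         => 'my_regression_data.csv',
                 dependent_variable_column => 2,     # assumed option name
                 predictor_columns         => [1],   # assumed option name
                 max_depth_desired         => 2,
             );
    $rt->get_training_data_for_regression();         # assumed method names below
    my $root_node = $rt->construct_regression_tree();
    $rt->prediction_for_single_data_point($root_node, ['x=40']);
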
=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs. When sending email, please place
the string 'DecisionTree' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-DecisionTree-3.41.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-DecisionTree-3.41>. Enter this directory and execute the following
commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

=head1 THANKS

I wish to thank many users of this module for their feedback. Many of the
improvements I have made to the module over the years are a result of the feedback
received.

I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
used for testing the module. Slaven's feedback in the form of the Bug report
C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
the required version of Perl to 5.10.

On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version
2.27, I am removing the Perl version restriction altogether from Version 2.30.
Thanks Slaven!

=head1 AUTHOR

The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
with the publication of the book I<Designing with Objects> by John-Wiley. If
interested, visit his web page at Purdue to find out what this project was all
about. You might like I<Designing with Objects> especially if you enjoyed reading
Harry Potter as a kid (or even as an adult, for that matter).

If you send email regarding this module, please place the string "DecisionTree" in
your subject line to get past my spam filter. Avi Kak's email address is
C<kak@purdue.edu>

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2016 Avinash Kak

=cut