line   stmt   bran   cond   sub   pod   time   code
package Algorithm::DecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2017 Avinash Kak. All rights reserved.  This program is free
# software.  You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
# multidimensional data.
# -------------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;

our $VERSION = '3.43';

############################################ Constructor ##############################################
sub new {
    my ($class, %args, $eval_or_boosting_mode);
    if (@_ % 2 != 0) {
        ($class, %args) = @_;
    } else {
        $class = shift;
        $eval_or_boosting_mode = shift;
        die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
        die "Only one string arg allowed in eval and boosting modes" if @_;
    }
    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
              if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                          =>  $args{training_datafile},
        _entropy_threshold                          =>  $args{entropy_threshold} || 0.01,
        _max_depth_desired                          =>  exists $args{max_depth_desired} ?
                                                               $args{max_depth_desired} : undef,
        _debug1                                     =>  $args{debug1} || 0,
        _debug2                                     =>  $args{debug2} || 0,
        _debug3                                     =>  $args{debug3} || 0,
        _csv_class_column_index                     =>  $args{csv_class_column_index} || undef,
        _csv_columns_for_features                   =>  $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold  =>  $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins                   =>  $args{number_of_histogram_bins} || undef,
        _csv_cleanup_needed                         =>  $args{csv_cleanup_needed} || 0,
        _training_data                              =>  [],
        _root_node                                  =>  undef,
        _probability_cache                          =>  {},
        _entropy_cache                              =>  {},
        _training_data_hash                         =>  {},
        _features_and_values_hash                   =>  {},
        _samples_class_label_hash                   =>  {},
        _class_names                                =>  [],
        _class_priors                               =>  [],
        _class_priors_hash                          =>  {},
        _feature_names                              =>  [],
        _numeric_features_valuerange_hash           =>  {},
        _sampling_points_for_numeric_feature_hash   =>  {},
        _feature_values_how_many_uniques_hash       =>  {},
        _prob_distribution_numeric_features_hash    =>  {},
        _histogram_delta_hash                       =>  {},
        _num_of_histogram_bins_hash                 =>  {},
    }, $class;
}
69
70
#################################### Classify with Decision Tree #######################################

## Classifies one test sample at a time using the decision tree constructed from
## your training file. The data record for the test sample must be supplied as
## shown in the scripts in the `examples' subdirectory. See the scripts
## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
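## A minimal usage sketch (hedged: the file name, column indexes, and the feature
## names/values below are hypothetical stand-ins patterned after those scripts; the
## data-ingest calls are defined elsewhere in this module):
##
##     my $dt = Algorithm::DecisionTree->new(
##                          training_datafile         => 'training.csv',
##                          csv_class_column_index    => 1,
##                          csv_columns_for_features  => [2,3,4,5],
##                          entropy_threshold         => 0.01,
##                          max_depth_desired         => 3,
##                          csv_cleanup_needed        => 1,
##              );
##     $dt->get_training_data();
##     $dt->calculate_first_order_probabilities();
##     $dt->calculate_class_priors();
##     my $root_node = $dt->construct_decision_tree_classifier();
##     my @test_sample = ('g2=4.2', 'grade=2.3', 'gleason=4', 'eet=1.7');
##     my $classification = $dt->classify($root_node, \@test_sample);
##     print "$_  =>  $classification->{$_}\n" for sort keys %$classification;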
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex =  '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "\n\nError in the names you have used for features and/or values. " .
        "Try using the csv_cleanup_needed option in the constructor call."
        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
            $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node,
                                                           \@features_and_values,\%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name  with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} = \@outlist;
        }
    }
    return \%classification_for_display;
}
131
132
sub recursive_descent_for_classification {
133
0
0
0
0
my $self = shift;
134
0
0
my $node = shift;
135
0
0
my $features_and_values = shift;
136
0
0
my $answer = shift;
137
0
0
my @features_and_values = @$features_and_values;
138
0
0
my %answer = %$answer;
139
0
0
my @children = @{$node->get_children()};
0
0
140
0
0
0
if (@children == 0) {
141
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
142
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
143
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
144
}
145
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
146
0
0
return \%answer;
147
}
148
0
0
my $feature_tested_at_node = $node->get_feature();
149
print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
150
0
0
0
if $self->{_debug3};
151
0
0
my $value_for_feature;
152
my $path_found;
153
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
154
0
0
foreach my $feature_and_value (@features_and_values) {
155
0
0
$feature_and_value =~ /$pattern/;
156
0
0
0
$value_for_feature = $2 if $feature_tested_at_node eq $1;
157
}
158
# The following clause introduced in Version 3.20
159
0
0
0
if (!defined $value_for_feature) {
160
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
161
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
162
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
163
}
164
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
165
0
0
return \%answer;
166
}
167
0
0
0
if ($value_for_feature) {
168
0
0
0
if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
0
0
169
0
0
0
print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
170
0
0
my $pattern1 = '(.+)<(.+)';
171
0
0
my $pattern2 = '(.+)>(.+)';
172
0
0
foreach my $child (@children) {
173
0
0
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
0
0
174
0
0
my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
175
0
0
0
if ($last_feature_and_value_on_branch =~ /$pattern1/) {
176
0
0
my ($feature, $threshold) = ($1,$2);
177
0
0
0
if ($value_for_feature <= $threshold) {
178
0
0
$path_found = 1;
179
0
0
%answer = %{$self->recursive_descent_for_classification($child,
0
0
180
$features_and_values,\%answer)};
181
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
182
0
0
last;
183
}
184
}
185
0
0
0
if ($last_feature_and_value_on_branch =~ /$pattern2/) {
186
0
0
my ($feature, $threshold) = ($1,$2);
187
0
0
0
if ($value_for_feature > $threshold) {
188
0
0
$path_found = 1;
189
0
0
%answer = %{$self->recursive_descent_for_classification($child,
0
0
190
$features_and_values,\%answer)};
191
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
192
0
0
last;
193
}
194
}
195
}
196
0
0
0
return \%answer if $path_found;
197
} else {
198
0
0
my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
199
print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
200
0
0
0
if $self->{_debug3};
201
0
0
foreach my $child (@children) {
202
0
0
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
0
0
203
0
0
0
print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
204
0
0
my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
205
0
0
0
if ($last_feature_and_value_on_branch eq $feature_value_combo) {
206
0
0
%answer = %{$self->recursive_descent_for_classification($child,
0
0
207
$features_and_values,\%answer)};
208
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
209
0
0
$path_found = 1;
210
0
0
last;
211
}
212
}
213
0
0
0
return \%answer if $path_found;
214
}
215
}
216
0
0
0
if (! $path_found) {
217
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
218
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
219
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
220
}
221
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
222
}
223
0
0
return \%answer;
224
}
225
226
## If you want classification to be carried out by engaging a human user in a
## question-answer session, this is the method to use for that purpose. See, for
## example, the script classify_by_asking_questions.pl in the `examples'
## subdirectory for an illustration of how to do that.
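## A minimal usage sketch (hedged; it assumes a decision tree already constructed as
## shown earlier, and it prompts the user at the terminal for each feature value):
##
##     my $classification = $dt->classify_by_asking_questions($root_node);
##     print "$_  =>  $classification->{$_}\n" for sort keys %$classification;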
sub classify_by_asking_questions {
    my $self = shift;
    my $root_node = shift;
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %scratchpad_for_numeric_answers = ();
    foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
        $scratchpad_for_numeric_answers{$feature_name} = undef;
    }
    my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
                                                  \%answer, \%scratchpad_for_numeric_answers)};
    @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} = \@outlist;
        }
    }
    return \%classification_for_display;
}
259
260
sub interactive_recursive_descent_for_classification {
261
0
0
0
0
my $self = shift;
262
0
0
my $node = shift;
263
0
0
my $answer = shift;
264
0
0
my $scratchpad_for_numerics = shift;
265
0
0
my %answer = %$answer;
266
0
0
my %scratchpad_for_numerics = %$scratchpad_for_numerics;
267
0
0
my $pattern1 = '(.+)<(.+)';
268
0
0
my $pattern2 = '(.+)>(.+)';
269
0
0
my $user_value_for_feature;
270
0
0
my @children = @{$node->get_children()};
0
0
271
0
0
0
if (@children == 0) {
272
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
273
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
274
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
275
}
276
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
277
0
0
return \%answer;
278
}
279
0
0
my @list_of_branch_attributes_to_children = ();
280
0
0
foreach my $child (@children) {
281
0
0
my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
0
0
282
0
0
my $feature_and_value_on_branch = $branch_features_and_values[-1];
283
0
0
push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
284
}
285
0
0
my $feature_tested_at_node = $node->get_feature();
286
0
0
my $feature_value_combo;
287
0
0
my $path_found = 0;
288
0
0
0
if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
0
0
289
0
0
0
if ($scratchpad_for_numerics{$feature_tested_at_node}) {
290
0
0
$user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
291
} else {
292
0
0
my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
0
0
293
0
0
while (1) {
294
0
0
print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
295
0
0
print "\nEnter a value in the range (@valuerange): ";
296
0
0
$user_value_for_feature = <STDIN>;
297
0
0
$user_value_for_feature =~ s/\r?\n?$//;
298
0
0
$user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
299
0
0
my $answer_found = 0;
300
0
0
0
0
if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
301
0
0
$answer_found = 1;
302
0
0
last;
303
}
304
0
0
0
last if $answer_found;
305
0
0
print("You entered an illegal value. Let's try again.");
306
}
307
0
0
$scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
308
}
309
0
0
foreach my $i (0..@list_of_branch_attributes_to_children-1) {
310
0
0
my $branch_attribute = $list_of_branch_attributes_to_children[$i];
311
0
0
0
if ($branch_attribute =~ /$pattern1/) {
312
0
0
my ($feature,$threshold) = ($1,$2);
313
0
0
0
if ($user_value_for_feature <= $threshold) {
314
0
0
%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
0
0
315
\%answer, \%scratchpad_for_numerics)};
316
0
0
$path_found = 1;
317
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
318
0
0
last;
319
}
320
}
321
0
0
0
if ($branch_attribute =~ /$pattern2/) {
322
0
0
my ($feature,$threshold) = ($1,$2);
323
0
0
0
if ($user_value_for_feature > $threshold) {
324
0
0
%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
0
0
325
\%answer, \%scratchpad_for_numerics)};
326
0
0
$path_found = 1;
327
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
328
0
0
last;
329
}
330
}
331
}
332
0
0
0
return \%answer if $path_found;
333
} else {
334
0
0
my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
0
0
335
0
0
while (1) {
336
0
0
print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
337
0
0
print "\nEnter a value from the list (@possible_values_for_feature): ";
338
0
0
$user_value_for_feature = <STDIN>;
339
0
0
$user_value_for_feature =~ s/\r?\n?$//;
340
0
0
$user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
341
0
0
my $answer_found = 0;
342
0
0
0
if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
343
0
0
$answer_found = 1;
344
0
0
last;
345
}
346
0
0
0
last if $answer_found;
347
0
0
print("You entered an illegal value. Let's try again.");
348
}
349
0
0
$feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
350
0
0
foreach my $i (0..@list_of_branch_attributes_to_children-1) {
351
0
0
my $branch_attribute = $list_of_branch_attributes_to_children[$i];
352
0
0
0
if ($branch_attribute eq $feature_value_combo) {
353
0
0
%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
0
0
354
\%answer, \%scratchpad_for_numerics)};
355
0
0
$path_found = 1;
356
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
357
0
0
last;
358
}
359
}
360
0
0
0
return \%answer if $path_found;
361
}
362
0
0
0
if (! $path_found) {
363
0
0
my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
0
0
364
0
0
foreach my $i (0..@{$self->{_class_names}}-1) {
0
0
365
0
0
$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
366
}
367
0
0
push @{$answer{'solution_path'}}, $node->get_serial_num();
0
0
368
}
369
0
0
return \%answer;
370
}
371
372
###################################### Decision Tree Construction ####################################

## At the root node, we find the best feature that yields the greatest reduction in
## class entropy from the entropy based on just the class priors. The logic for
## finding this feature is different for symbolic features and for numeric features.
## That logic is built into the method shown later for best feature calculations.
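## A minimal sketch (a hypothetical helper, NOT part of the module) of the quantity
## that drives construction: for a candidate symbolic feature, the expected class
## entropy after the split is the per-value class entropies weighted by the
## probabilities of those values. A feature is worth splitting on when this falls
## sufficiently below the entropy already present at the node.
##
##     sub _average_entropy_after_split {
##         my ($value_probs, $entropies_given_value) = @_;    # parallel array refs
##         my $avg = 0;
##         foreach my $i (0 .. $#$value_probs) {
##             $avg += $value_probs->[$i] * $entropies_given_value->[$i];
##         }
##         return $avg;    # compared against the node's existing entropy
##     }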
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {
        $self->determine_data_condition();
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) {
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}
398
399
## After the root node of the decision tree is calculated by the previous methods,
400
## we invoke this method recursively to create the rest of the tree. At each node,
401
## we find the feature that achieves the largest entropy reduction with regard to
402
## the partitioning of the training data samples that correspond to that node.
403
sub recursive_descent {
404
46
46
0
119
my $self = shift;
405
46
109
my $node = shift;
406
print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
407
46
50
182
if $self->{_debug3};
408
46
171
my $node_serial_number = $node->get_serial_num();
409
46
106
my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
46
175
410
46
165
my $existing_node_entropy = $node->get_node_entropy();
411
46
50
315
if ($self->{_debug3}) {
412
0
0
print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
413
0
0
print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
414
0
0
print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
415
0
0
my @class_probs = @{$node->get_class_probabilities()};
0
0
416
0
0
print "\nRD4 Class probabilities: @class_probs\n";
417
}
418
46
50
249
if ($existing_node_entropy < $self->{_entropy_threshold}) {
419
0
0
0
print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
420
0
0
return;
421
}
422
46
108
my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
46
157
423
46
245
my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
424
$self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
425
46
329
$node->set_feature($best_feature);
426
46
50
205
$node->display_node() if $self->{_debug3};
427
46
50
33
188
if (defined($self->{_max_depth_desired}) &&
428
(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
429
0
0
0
print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
430
0
0
return;
431
}
432
46
50
175
return if ! defined $best_feature;
433
46
50
161
if ($self->{_debug3}) {
434
0
0
print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
435
0
0
print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
436
0
0
print "\nRD9 Best feature entropy: $best_feature_entropy\n";
437
0
0
print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
438
}
439
46
140
my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
440
46
50
174
print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
441
46
100
189
if ($entropy_gain > $self->{_entropy_threshold}) {
442
26
50
33
154
if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
443
$self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
444
$self->{_symbolic_to_numeric_cardinality_threshold}) {
445
0
0
my $best_threshold = $decision_val; # as returned by best feature calculator
446
0
0
my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
447
my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
448
0
0
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
449
my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
450
0
0
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
451
0
0
my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
452
0
0
my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
453
0
0
push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
454
$feature_threshold_combo_for_less_than;
455
0
0
push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
456
$feature_threshold_combo_for_greater_than;
457
0
0
0
if ($self->{_debug3}) {
458
0
0
print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
459
"@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
460
0
0
print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
461
"@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
462
}
463
my @class_probabilities_for_lessthan_child_node =
464
0
0
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
465
0
0
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
0
0
466
my @class_probabilities_for_greaterthan_child_node =
467
0
0
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
468
0
0
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
0
0
469
0
0
0
if ($self->{_debug3}) {
470
0
0
print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
471
0
0
print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
472
}
473
0
0
0
if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
474
0
0
my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
475
\@class_probabilities_for_lessthan_child_node,
476
\@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
477
0
0
$node->add_child_link($left_child_node);
478
0
0
$self->recursive_descent($left_child_node);
479
}
480
0
0
0
if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
481
0
0
my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
482
\@class_probabilities_for_greaterthan_child_node,
483
\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
484
0
0
$node->add_child_link($right_child_node);
485
0
0
$self->recursive_descent($right_child_node);
486
}
487
} else {
488
print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
489
26
50
108
if $self->{_debug3};
490
26
68
my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
26
189
491
26
50
116
print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
492
26
95
my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
88
495
493
26
88
my @class_entropies_for_children = ();
494
26
126
foreach my $feature_and_value_index (0..@feature_value_combos-1) {
495
print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
496
88
50
328
if $self->{_debug3};
497
88
186
my @extended_branch_features_and_values_or_thresholds;
498
88
100
296
if (! @features_and_values_or_thresholds_on_branch) {
499
4
18
@extended_branch_features_and_values_or_thresholds =
500
($feature_value_combos[$feature_and_value_index]);
501
} else {
502
@extended_branch_features_and_values_or_thresholds =
503
84
177
@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
84
258
504
84
279
push @extended_branch_features_and_values_or_thresholds,
505
$feature_value_combos[$feature_and_value_index];
506
}
507
my @class_probabilities =
508
176
568
map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
509
88
191
\@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
88
290
510
88
310
my $class_entropy_for_child =
511
$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
512
\@extended_branch_features_and_values_or_thresholds);
513
88
50
297
if ($self->{_debug3}) {
514
0
0
print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
515
0
0
print "\nRD20 class entropy for child: $class_entropy_for_child\n";
516
}
517
88
100
302
if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
518
45
290
my $child_node = DTNode->new(undef, $class_entropy_for_child,
519
\@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
520
45
227
$node->add_child_link($child_node);
521
45
289
$self->recursive_descent($child_node);
522
} else {
523
43
50
281
print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
524
}
525
}
526
}
527
} else {
528
print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
529
20
50
74
if $self->{_debug3};
530
20
159
return;
531
}
532
}
533
534
## This is the heart of the decision tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples that
## correspond to the current node. The search for the best feature is carried out
## differently for symbolic features and for numeric features. For a symbolic
## feature, the method estimates the entropy for each value of the feature and then
## averages out these entropies as a measure of the discriminatory power of that
## feature. For a numeric feature, on the other hand, it estimates the entropy
## reduction that can be achieved if we were to partition the set of training
## samples at each possible threshold, with all of the sampling points relevant to
## the node in question considered as candidate thresholds.
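## For a numeric feature F and a candidate threshold t (drawn from the feature's
## sampling points), the loop below scores t with the weighted sum of the two child
## entropies -- a restatement of the $partitioning_entropy computation:
##
##     H(t) = H(C | branch, F < t) * P(branch, F < t)  +  H(C | branch, F > t) * P(branch, F > t)
##
## and the threshold retained for the feature is the t that minimizes H(t), provided
## that minimum falls below the entropy already present at the node.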
sub best_feature_calculator {
545
46
46
0
108
my $self = shift;
546
46
105
my $features_and_values_or_thresholds_on_branch = shift;
547
46
120
my $existing_node_entropy = shift;
548
46
217
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
549
46
109
my $pattern1 = '(.+)=(.+)';
550
46
106
my $pattern2 = '(.+)<(.+)';
551
46
145
my $pattern3 = '(.+)>(.+)';
552
46
125
my @all_symbolic_features = ();
553
46
112
foreach my $feature_name (@{$self->{_feature_names}}) {
46
153
554
push @all_symbolic_features, $feature_name
555
184
50
611
if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
556
}
557
46
150
my @symbolic_features_already_used = ();
558
46
136
foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
559
138
50
1156
push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
560
}
561
46
150
my @symbolic_features_not_yet_used;
562
46
130
foreach my $x (@all_symbolic_features) {
563
184
100
539
push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
564
}
565
46
128
my @true_numeric_types = ();
566
46
103
my @symbolic_types = ();
567
46
160
my @true_numeric_types_feature_names = ();
568
46
116
my @symbolic_types_feature_names = ();
569
46
118
foreach my $item (@features_and_values_or_thresholds_on_branch) {
570
138
50
1047
if ($item =~ /$pattern2/) {
50
50
571
0
0
push @true_numeric_types, $item;
572
0
0
push @true_numeric_types_feature_names, $1;
573
} elsif ($item =~ /$pattern3/) {
574
0
0
push @true_numeric_types, $item;
575
0
0
push @true_numeric_types_feature_names, $1;
576
} elsif ($item =~ /$pattern1/) {
577
138
342
push @symbolic_types, $item;
578
138
439
push @symbolic_types_feature_names, $1;
579
} else {
580
0
0
die "format error in the representation of feature and values or thresholds";
581
}
582
}
583
46
182
my %seen = ();
584
46
0
130
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
0
0
585
46
123
%seen = ();
586
46
50
141
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
138
765
587
my @bounded_intervals_numeric_types =
588
46
112
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
46
205
589
# Calculate the upper and the lower bounds to be used when searching for the best
590
# threshold for each of the numeric features that are in play at the current node:
591
46
141
my (%upperbound, %lowerbound);
592
46
144
foreach my $feature (@true_numeric_types_feature_names) {
593
0
0
$upperbound{$feature} = undef;
594
0
0
$lowerbound{$feature} = undef;
595
}
596
46
130
foreach my $item (@bounded_intervals_numeric_types) {
597
0
0
foreach my $feature_grouping (@$item) {
598
0
0
0
if ($feature_grouping->[1] eq '>') {
599
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
600
} else {
601
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
602
}
603
}
604
}
605
46
141
my %entropy_values_for_different_features = ();
606
46
114
my %partitioning_point_child_entropies_hash = ();
607
46
107
my %partitioning_point_threshold = ();
608
46
118
my %entropies_for_different_values_of_symbolic_feature = ();
609
46
106
foreach my $feature (@{$self->{_feature_names}}) {
46
159
610
184
448
$entropy_values_for_different_features{$feature} = [];
611
184
453
$partitioning_point_child_entropies_hash{$feature} = {};
612
184
374
$partitioning_point_threshold{$feature} = undef;
613
184
519
$entropies_for_different_values_of_symbolic_feature{$feature} = [];
614
}
615
46
112
foreach my $i (0..@{$self->{_feature_names}}-1) {
46
167
616
184
544
my $feature_name = $self->{_feature_names}->[$i];
617
184
50
545
print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
618
184
100
33
590
if (contained_in($feature_name, @symbolic_features_already_used)) {
50
619
138
431
next;
620
46
231
} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
621
$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
622
$self->{_symbolic_to_numeric_cardinality_threshold}) {
623
0
0
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
624
0
0
0
print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
625
0
0
my @newvalues = ();
626
0
0
0
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
627
0
0
0
0
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
0
0
0
0
0
0
628
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
629
0
0
next;
630
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
631
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
632
0
0
foreach my $x (@values) {
633
0
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
634
}
635
} elsif (defined($upperbound{$feature_name})) {
636
0
0
foreach my $x (@values) {
637
0
0
0
push @newvalues, $x if $x <= $upperbound{$feature_name};
638
}
639
} elsif (defined($lowerbound{$feature_name})) {
640
0
0
foreach my $x (@values) {
641
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name};
642
}
643
} else {
644
0
0
die "Error in bound specifications in best feature calculator";
645
}
646
} else {
647
0
0
@newvalues = @{deep_copy_array(\@values)};
0
0
648
}
649
0
0
0
next if @newvalues == 0;
650
0
0
my @partitioning_entropies = ();
651
0
0
foreach my $value (@newvalues) {
652
0
0
my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
653
0
0
my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
654
0
0
my @for_left_child;
655
my @for_right_child;
656
0
0
0
if (@features_and_values_or_thresholds_on_branch) {
657
0
0
@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
658
0
0
push @for_left_child, $feature_and_less_than_value_string;
659
0
0
@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
0
0
660
0
0
push @for_right_child, $feature_and_greater_than_value_string;
661
} else {
662
0
0
@for_left_child = ($feature_and_less_than_value_string);
663
0
0
@for_right_child = ($feature_and_greater_than_value_string);
664
}
665
0
0
my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
666
\@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667
0
0
my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
668
\@features_and_values_or_thresholds_on_branch, $feature_name, $value);
669
0
0
my $partitioning_entropy = $entropy1 *
670
$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
671
$entropy2 *
672
$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
673
674
0
0
push @partitioning_entropies, $partitioning_entropy;
675
0
0
$partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
676
}
677
0
0
my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
678
0
0
0
if ($min_entropy < $existing_node_entropy) {
679
0
0
$partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
680
0
0
$entropy_values_for_different_features{$feature_name} = $min_entropy;
681
}
682
} else {
683
46
50
190
print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
684
46
50
175
print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
685
46
140
my %seen;
686
159
50
1182
my @values = grep {$_ ne 'NA' && !$seen{$_}++}
687
46
117
@{$self->{_features_and_unique_values_hash}->{$feature_name}};
46
223
688
46
282
@values = sort @values;
689
46
50
210
print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
690
691
46
134
my $entropy = 0;
692
46
154
foreach my $value (@values) {
693
159
674
my $feature_value_string = "$feature_name" . '=' . "$value";
694
159
50
536
print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
695
159
313
my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
159
528
696
159
100
570
if (@features_and_values_or_thresholds_on_branch) {
697
145
385
push @extended_attributes, $feature_value_string;
698
} else {
699
14
62
@extended_attributes = ($feature_value_string);
700
}
701
159
608
$entropy +=
702
$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
703
$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
704
print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
705
159
50
595
"is $entropy\n" if $self->{_debug3};
706
159
377
push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
159
799
707
}
708
46
50
225
if ($entropy < $existing_node_entropy) {
709
46
465
$entropy_values_for_different_features{$feature_name} = $entropy;
710
}
711
}
712
}
713
46
189
my $min_entropy_for_best_feature;
714
my $best_feature_name;
715
46
224
foreach my $feature_nom (keys %entropy_values_for_different_features) {
716
184
100
478
if (!defined($best_feature_name)) {
717
46
106
$best_feature_name = $feature_nom;
718
46
157
$min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
719
} else {
720
138
100
503
if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
721
43
131
$best_feature_name = $feature_nom;
722
43
118
$min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
723
}
724
}
725
}
726
46
120
my $threshold_for_best_feature;
727
46
50
155
if (exists $partitioning_point_threshold{$best_feature_name}) {
728
46
126
$threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
729
} else {
730
0
0
$threshold_for_best_feature = undef;
731
}
732
46
116
my $best_feature_entropy = $min_entropy_for_best_feature;
733
46
112
my @val_based_entropies_to_be_returned;
734
my $decision_val_to_be_returned;
735
46
50
33
236
if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
736
$self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
737
$self->{_symbolic_to_numeric_cardinality_threshold}) {
738
@val_based_entropies_to_be_returned =
739
0
0
@{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
0
0
740
} else {
741
46
125
@val_based_entropies_to_be_returned = ();
742
}
743
46
50
152
if (exists $partitioning_point_threshold{$best_feature_name}) {
744
46
110
$decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
745
} else {
746
0
0
$decision_val_to_be_returned = undef;
747
}
748
print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
749
46
50
174
"@val_based_entropies_to_be_returned\n" if $self->{_debug3};
750
46
1506
return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
751
$decision_val_to_be_returned);
752
}
753
754
######################################### Entropy Calculators #####################################
755
756
sub class_entropy_on_priors {
757
1
1
0
5
my $self = shift;
758
return $self->{_entropy_cache}->{'priors'}
759
1
50
8
if exists $self->{_entropy_cache}->{"priors"};
760
1
4
my @class_names = @{$self->{_class_names}};
1
7
761
1
4
my $entropy;
762
1
5
foreach my $class (@class_names) {
763
2
11
my $prob = $self->prior_probability_for_class($class);
764
2
50
33
28
my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
765
2
50
10
$log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
766
2
50
10
$log_prob = 0 if $prob > 0.999; # since log(1) = 0
767
2
100
10
if (!defined $entropy) {
768
1
4
$entropy = -1.0 * $prob * $log_prob;
769
1
5
next;
770
}
771
1
7
$entropy += -1.0 * $prob * $log_prob;
772
}
773
1
5
$self->{_entropy_cache}->{'priors'} = $entropy;
774
1
5
return $entropy;
775
}
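## A worked example of the value returned above (hypothetical priors): with two
## classes whose priors are 0.62 and 0.38,
##
##     H  =  -0.62 * log2(0.62) - 0.38 * log2(0.38)  =  0.428 + 0.530  ~=  0.958 bits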
776
777
sub entropy_scanner_for_a_numeric_feature {
778
0
0
0
0
local $| = 1;
779
0
0
my $self = shift;
780
0
0
my $feature = shift;
781
0
0
my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
0
0
782
0
0
my @entropies_for_less_than_thresholds = ();
783
0
0
my @entropies_for_greater_than_thresholds = ();
784
0
0
foreach my $point (@all_sampling_points) {
785
0
0
print ". ";
786
0
0
push @entropies_for_less_than_thresholds,
787
$self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
788
0
0
push @entropies_for_greater_than_thresholds,
789
$self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
790
}
791
0
0
print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
792
"@entropies_for_less_than_thresholds\n";
793
0
0
print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
794
"@entropies_for_greater_than_thresholds\n";
795
}
796
797
sub class_entropy_for_less_than_threshold_for_feature {
798
0
0
0
0
my $self = shift;
799
0
0
my $arr = shift;
800
0
0
my $feature = shift;
801
0
0
my $threshold = shift;
802
0
0
my @array_of_features_and_values_or_thresholds = @$arr;
803
0
0
my $feature_threshold_combo = "$feature" . '<' . "$threshold";
804
0
0
my $sequence = join ":", @array_of_features_and_values_or_thresholds;
805
0
0
$sequence .= ":" . $feature_threshold_combo;
806
0
0
0
return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
807
my @copy_of_array_of_features_and_values_or_thresholds =
808
0
0
@{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
0
0
809
0
0
push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
810
0
0
my $entropy = 0;
811
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
812
0
0
my $log_prob = undef;
813
0
0
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
814
$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
815
0
0
0
0
if ($prob >= 0.0001 && $prob <= 0.999) {
0
0
816
0
0
$log_prob = log($prob) / log(2.0);
817
} elsif ($prob < 0.0001) {
818
0
0
$log_prob = 0;
819
} elsif ($prob > 0.999) {
820
0
0
$log_prob = 0;
821
} else {
822
0
0
die "An error has occurred in log_prob calculation";
823
}
824
0
0
$entropy += -1.0 * $prob * $log_prob;
825
}
826
0
0
0
if (abs($entropy) < 0.0000001) {
827
0
0
$entropy = 0.0;
828
}
829
0
0
$self->{_entropy_cache}->{$sequence} = $entropy;
830
0
0
return $entropy;
831
}
832
833
sub class_entropy_for_greater_than_threshold_for_feature {
834
0
0
0
0
my $self = shift;
835
0
0
my $arr = shift;
836
0
0
my $feature = shift;
837
0
0
my $threshold = shift;
838
0
0
my @array_of_features_and_values_or_thresholds = @$arr;
839
0
0
my $feature_threshold_combo = "$feature" . '>' . "$threshold";
840
0
0
my $sequence = join ":", @array_of_features_and_values_or_thresholds;
841
0
0
$sequence .= ":" . $feature_threshold_combo;
842
0
0
0
return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
843
my @copy_of_array_of_features_and_values_or_thresholds =
844
0
0
@{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
0
0
845
0
0
push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
846
0
0
my $entropy = 0;
847
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
848
0
0
my $log_prob = undef;
849
0
0
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
850
$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
851
0
0
0
0
if ($prob >= 0.0001 && $prob <= 0.999) {
0
0
852
0
0
$log_prob = log($prob) / log(2.0);
853
} elsif ($prob < 0.0001) {
854
0
0
$log_prob = 0;
855
} elsif ($prob > 0.999) {
856
0
0
$log_prob = 0;
857
} else {
858
0
0
die "An error has occurred in log_prob calculation";
859
}
860
0
0
$entropy += -1.0 * $prob * $log_prob;
861
}
862
0
0
0
if (abs($entropy) < 0.0000001) {
863
0
0
$entropy = 0.0;
864
}
865
0
0
$self->{_entropy_cache}->{$sequence} = $entropy;
866
0
0
return $entropy;
867
}
868
869
sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
870
247
247
0
566
my $self = shift;
871
247
456
my $array_of_features_and_values_or_thresholds = shift;
872
247
670
my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
873
247
725
my $sequence = join ":", @array_of_features_and_values_or_thresholds;
874
247
100
1033
return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
875
159
363
my $entropy = 0;
876
159
310
foreach my $class_name (@{$self->{_class_names}}) {
159
469
877
318
698
my $log_prob = undef;
878
318
1015
my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
879
$class_name, \@array_of_features_and_values_or_thresholds);
880
318
50
33
1672
if ($prob >= 0.0001 && $prob <= 0.999) {
0
0
881
318
1056
$log_prob = log($prob) / log(2.0);
882
} elsif ($prob < 0.0001) {
883
0
0
$log_prob = 0;
884
} elsif ($prob > 0.999) {
885
0
0
$log_prob = 0;
886
} else {
887
0
0
die "An error has occurred in log_prob calculation";
888
}
889
318
918
$entropy += -1.0 * $prob * $log_prob;
890
}
891
159
50
593
if (abs($entropy) < 0.0000001) {
892
0
0
$entropy = 0.0;
893
}
894
159
578
$self->{_entropy_cache}->{$sequence} = $entropy;
895
159
791
return $entropy;
896
}
897
898
899
##################################### Probability Calculators ########################################
900
901
sub prior_probability_for_class {
902
4
4
0
10
my $self = shift;
903
4
11
my $class = shift;
904
4
14
my $class_name_in_cache = "prior" . '::' . $class;
905
return $self->{_probability_cache}->{$class_name_in_cache}
906
4
50
25
if exists $self->{_probability_cache}->{$class_name_in_cache};
907
0
0
my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
0
0
908
0
0
my @values = values %{$self->{_samples_class_label_hash}};
0
0
909
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
910
0
0
my @trues = grep {$_ eq $class_name} @values;
0
0
911
0
0
my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
912
0
0
my $this_class_name_in_cache = "prior" . '::' . $class_name;
913
0
0
$self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
914
}
915
0
0
return $self->{_probability_cache}->{$class_name_in_cache};
916
}
917
918
sub calculate_class_priors {
919
1
1
1
19
my $self = shift;
920
1
50
4
return if scalar keys %{$self->{_class_priors_hash}} > 1;
1
10
921
1
5
foreach my $class_name (@{$self->{_class_names}}) {
1
6
922
2
11
my $class_name_in_cache = "prior::$class_name";
923
2
6
my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
2
10
924
2
6
my @all_values = values %{$self->{_samples_class_label_hash}};
2
28
925
2
8
my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
70
196
926
2
11
my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
927
2
11
$self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
928
2
9
my $this_class_name_in_cache = "prior::$class_name";
929
2
17
$self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
930
}
931
1
50
9
if ($self->{_debug1}) {
932
0
0
foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
0
0
933
0
0
print "$class => $self->{_class_priors_hash}->{$class}\n";
934
}
935
}
936
}
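## A small worked example (hypothetical counts): if 35 of 100 training samples carry
## the class label 'benign', then after this method runs
##
##     $self->{_class_priors_hash}->{'benign'}           # 35 / 100  =  0.35
##     $self->{_probability_cache}->{'prior::benign'}    # same value, cached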
937
938
sub calculate_first_order_probabilities {
939
1
1
1
21
print "\nEstimating probabilities...\n";
940
1
3
my $self = shift;
941
1
2
foreach my $feature (@{$self->{_feature_names}}) {
1
4
942
4
30
$self->probability_of_feature_value($feature, undef);
943
4
50
28
if ($self->{_debug2}) {
944
0
0
0
if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
945
0
0
print "\nPresenting probability distribution for a numeric feature:\n";
946
0
0
foreach my $sampling_point (sort {$a <=> $b} keys
0
0
947
0
0
%{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
948
0
0
my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
949
print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
950
0
0
$self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
951
}
952
} else {
953
0
0
print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
954
0
0
my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
0
0
955
0
0
foreach my $value (sort @values_for_feature) {
956
0
0
my $prob = $self->probability_of_feature_value($feature,$value);
957
0
0
print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
958
}
959
}
960
}
961
}
962
}
963
964
sub probability_of_feature_value {
965
441
441
0
938
my $self = shift;
966
441
919
my $feature_name = shift;
967
441
942
my $value = shift;
968
441
50
66
2687
$value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
969
441
50
66
2082
if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
970
$value = closest_sampling_point($value,
971
0
0
$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
972
}
973
441
849
my $feature_and_value;
974
441
100
1224
if (defined($value)) {
975
437
1160
$feature_and_value = "$feature_name=$value";
976
}
977
441
50
66
2348
if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
978
437
1732
return $self->{_probability_cache}->{$feature_and_value};
979
}
980
4
15
my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
981
4
50
19
if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
982
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
983
$self->{_symbolic_to_numeric_cardinality_threshold}) {
984
0
0
0
if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
985
0
0
@valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
0
0
986
0
0
$diffrange = $valuerange[1] - $valuerange[0];
987
0
0
my %seen = ();
988
0
0
0
0
my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
0
0
989
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
990
0
0
my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
0
0
0
0
991
$unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
992
0
0
my $median_diff = $diffs[int(@diffs/2) - 1];
993
0
0
$histogram_delta = $median_diff * 2;
994
0
0
0
if ($histogram_delta < $diffrange / 500.0) {
995
0
0
0
if (defined $self->{_number_of_histogram_bins}) {
996
0
0
$histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
997
} else {
998
0
0
$histogram_delta = $diffrange / 500.0;
999
}
1000
}
1001
0
0
$self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1002
0
0
$num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1003
0
0
$self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1004
0
0
my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
0
0
1005
0..$num_of_histogram_bins-1;
1006
0
0
$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1007
\@sampling_points_for_feature;
1008
}
1009
}
1010
}
1011
4
50
15
if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1012
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1013
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1014
my @sampling_points_for_feature =
1015
0
0
@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
1016
0
0
my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1017
0
0
my @actual_values_for_feature = grep {$_ ne 'NA'}
1018
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1019
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1020
0
0
foreach my $j (0..@actual_values_for_feature-1) {
1021
0
0
0
if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1022
0
0
$counts_at_sampling_points[$i]++
1023
}
1024
}
1025
}
1026
0
0
my $total_counts = 0;
1027
0
0
map {$total_counts += $_} @counts_at_sampling_points;
0
0
1028
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
0
0
1029
0
0
my %bin_prob_hash = ();
1030
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1031
0
0
$bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1032
}
1033
0
0
$self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1034
0
0
my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
0
0
1035
@sampling_points_for_feature;
1036
0
0
foreach my $i (0..@values_for_feature-1) {
1037
0
0
$self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1038
}
1039
0
0
0
0
if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1040
0
0
return $self->{_probability_cache}->{$feature_and_value};
1041
} else {
1042
0
0
return 0;
1043
}
1044
} else {
1045
0
0
my %seen = ();
1046
0
0
0
0
my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1047
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1048
0
0
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
0
0
1049
0
0
my @value_counts = (0) x @values_for_feature;
1050
# foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1051
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
0
0
0
0
1052
0
0
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
0
0
1053
0
0
foreach my $i (0..@values_for_feature-1) {
1054
0
0
foreach my $current_value (@features_and_values) {
1055
0
0
0
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1056
}
1057
}
1058
}
1059
0
0
my $total_counts = 0;
1060
0
0
map {$total_counts += $_} @value_counts;
0
0
1061
0
0
0
die "PFV Something is wrong with your training file. It contains no training samples \
1062
for feature named $feature_name" if $total_counts == 0;
1063
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
0
0
1064
0
0
foreach my $i (0..@values_for_feature-1) {
1065
0
0
$self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1066
}
1067
0
0
0
0
if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1068
0
0
return $self->{_probability_cache}->{$feature_and_value};
1069
} else {
1070
0
0
return 0;
1071
}
1072
}
1073
} else {
1074
# This section is only for purely symbolic features:
1075
4
12
my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
4
33
1076
4
15
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
140
272
1077
4
21
my @value_counts = (0) x @values_for_feature;
1078
# foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1079
4
10
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
544
1144
4
53
1080
140
281
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
140
657
1081
140
401
foreach my $i (0..@values_for_feature-1) {
1082
4900
9059
for my $current_value (@features_and_values) {
1083
19600
100
47918
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1084
}
1085
}
1086
}
1087
4
56
foreach my $i (0..@values_for_feature-1) {
1088
$self->{_probability_cache}->{$values_for_feature[$i]} =
1089
140
221
$value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
140
382
1090
}
1091
4
50
33
32
if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1092
0
0
return $self->{_probability_cache}->{$feature_and_value};
1093
} else {
1094
4
40
return 0;
1095
}
1096
}
1097
}
1098
1099
sub probability_of_feature_value_given_class {
1100
882
882
0
1879
my $self = shift;
1101
882
1754
my $feature_name = shift;
1102
882
1672
my $feature_value = shift;
1103
882
1720
my $class_name = shift;
1104
882
50
33
4736
$feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1105
882
50
33
4195
if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1106
$feature_value = closest_sampling_point($feature_value,
1107
0
0
$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1108
}
1109
882
1780
my $feature_value_class;
1110
882
50
2234
if (defined($feature_value)) {
1111
882
2648
$feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1112
}
1113
882
100
66
4649
if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1114
print "\nNext answer returned by cache for feature $feature_name and " .
1115
874
50
2536
"value $feature_value given class $class_name\n" if $self->{_debug2};
1116
874
3495
return $self->{_probability_cache}->{$feature_value_class};
1117
}
1118
8
42
my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1119
1120
8
50
45
if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1121
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1122
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1123
0
0
$histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1124
0
0
$num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1125
0
0
@valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
0
0
1126
0
0
$diffrange = $valuerange[1] - $valuerange[0];
1127
}
1128
}
1129
8
24
my @samples_for_class = ();
1130
# Accumulate all samples names for the given class:
1131
8
22
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
8
98
1132
280
100
907
if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1133
140
384
push @samples_for_class, $sample_name;
1134
}
1135
}
1136
8
50
54
if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1137
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1138
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1139
my @sampling_points_for_feature =
1140
0
0
@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
1141
0
0
my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1142
0
0
my @actual_feature_values_for_samples_in_class = ();
1143
0
0
foreach my $sample (@samples_for_class) {
1144
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1145
0
0
my $pattern = '(.+)=(.+)';
1146
0
0
$feature_and_value =~ /$pattern/;
1147
0
0
my ($feature, $value) = ($1, $2);
1148
0
0
0
0
if (($feature eq $feature_name) && ($value ne 'NA')) {
1149
0
0
push @actual_feature_values_for_samples_in_class, $value;
1150
}
1151
}
1152
}
1153
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1154
0
0
foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1155
0
0
0
if (abs($sampling_points_for_feature[$i] -
1156
$actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1157
0
0
$counts_at_sampling_points[$i]++;
1158
}
1159
}
1160
}
1161
0
0
my $total_counts = 0;
1162
0
0
map {$total_counts += $_} @counts_at_sampling_points;
0
0
1163
0
0
0
die "PFVC1 Something is wrong with your training file. It contains no training " .
1164
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1165
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
0
0
1166
0
0
my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
0
0
1167
@sampling_points_for_feature;
1168
0
0
foreach my $i (0..@values_for_feature_and_class-1) {
1169
0
0
$self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1170
}
1171
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1172
0
0
return $self->{_probability_cache}->{$feature_value_class};
1173
} else {
1174
0
0
return 0;
1175
}
1176
} else {
1177
# This section is for numeric features that will be treated symbolically
1178
0
0
my %seen = ();
1179
0
0
0
0
my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1180
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1181
0
0
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
0
0
1182
0
0
my @value_counts = (0) x @values_for_feature;
1183
0
0
foreach my $sample (@samples_for_class) {
1184
0
0
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
0
0
1185
0
0
foreach my $i (0..@values_for_feature-1) {
1186
0
0
foreach my $current_value (@features_and_values) {
1187
0
0
0
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1188
}
1189
}
1190
}
1191
0
0
my $total_counts = 0;
1192
0
0
map {$total_counts += $_} @value_counts;
0
0
1193
0
0
0
die "PFVC2 Something is wrong with your training file. It contains no training " .
1194
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1195
# We normalize by total_count because the probabilities are conditioned on a given class
1196
0
0
foreach my $i (0..@values_for_feature-1) {
1197
0
0
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1198
0
0
$self->{_probability_cache}->{$feature_and_value_and_class} =
1199
$value_counts[$i] / (1.0 * $total_counts);
1200
}
1201
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1202
0
0
return $self->{_probability_cache}->{$feature_value_class};
1203
} else {
1204
0
0
return 0;
1205
}
1206
}
1207
} else {
1208
# This section is for purely symbolic features
1209
8
25
my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
8
107
1210
8
35
my %seen = ();
1211
280
100
66
1926
@values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1212
8
20
@{$self->{_features_and_values_hash}->{$feature_name}};
8
36
1213
8
33
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
28
118
1214
8
41
my @value_counts = (0) x @values_for_feature;
1215
8
31
foreach my $sample (@samples_for_class) {
1216
140
331
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
140
631
1217
140
440
foreach my $i (0..@values_for_feature-1) {
1218
490
1165
foreach my $current_value (@features_and_values) {
1219
1960
100
6044
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1220
}
1221
}
1222
}
1223
8
22
my $total_counts = 0;
1224
8
28
map {$total_counts += $_} @value_counts;
28
79
1225
8
50
39
die "PFVC2 Something is wrong with your training file. It contains no training " .
1226
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1227
# We normalize by total_count because the probabilities are conditioned on a given class
1228
8
41
foreach my $i (0..@values_for_feature-1) {
1229
28
110
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1230
28
189
$self->{_probability_cache}->{$feature_and_value_and_class} =
1231
$value_counts[$i] / (1.0 * $total_counts);
1232
}
1233
8
50
43
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1234
8
89
return $self->{_probability_cache}->{$feature_value_class};
1235
} else {
1236
0
0
return 0;
1237
}
1238
}
1239
}
1240
1241
sub probability_of_feature_less_than_threshold {
1242
0
0
0
0
my $self = shift;
1243
0
0
my $feature_name = shift;
1244
0
0
my $threshold = shift;
1245
0
0
my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1246
return $self->{_probability_cache}->{$feature_threshold_combo}
1247
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1248
0
0
0
my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
0
0
0
0
1249
0
0
0
my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
0
0
1250
0
0
my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1251
0
0
$self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1252
0
0
return $probability;
1253
}
1254
1255
sub probability_of_feature_less_than_threshold_given_class {
1256
0
0
0
0
my $self = shift;
1257
0
0
my $feature_name = shift;
1258
0
0
my $threshold = shift;
1259
0
0
my $class_name = shift;
1260
0
0
my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1261
return $self->{_probability_cache}->{$feature_threshold_class_combo}
1262
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1263
0
0
my @data_samples_for_class = ();
1264
# Accumulate all samples names for the given class:
1265
0
0
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
0
0
1266
push @data_samples_for_class, $sample_name
1267
0
0
0
if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1268
}
1269
0
0
my @actual_feature_values_for_samples_in_class = ();
1270
0
0
foreach my $sample (@data_samples_for_class) {
1271
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1272
0
0
my $pattern = '(.+)=(.+)';
1273
0
0
$feature_and_value =~ /$pattern/;
1274
0
0
my ($feature,$value) = ($1,$2);
1275
0
0
0
0
push @actual_feature_values_for_samples_in_class, $value
1276
if $feature eq $feature_name && $value ne 'NA';
1277
}
1278
}
1279
0
0
0
my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
0
0
1280
# The condition in the assignment that follows was a bug correction in Version 3.20
1281
0
0
0
my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1282
0
0
$self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1283
0
0
return $probability;
1284
}
1285
1286
# This method requires that all truly numeric types only be expressed as '<' or '>'
1287
# constructs in the array of branch features and thresholds
1288
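## A minimal sketch of the argument this method expects (the feature names here
## are illustrative, not tied to any particular training file). Symbolic features
## appear as 'name=value' strings; truly numeric features appear only as
## 'name<threshold' or 'name>threshold' strings:
##
##     my @branch = ('grade=2.3', 'g2<4.2', 'age>37');
##     my $prob   = $dt->probability_of_a_sequence_of_features_and_values_or_thresholds(\@branch);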
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1289
477
477
0
1038
my $self = shift;
1290
477
880
my $arr = shift;
1291
477
1471
my @array_of_features_and_values_or_thresholds = @$arr;
1292
477
50
1506
return if scalar @array_of_features_and_values_or_thresholds == 0;
1293
477
1469
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1294
477
100
2124
return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1295
145
319
my $probability = undef;
1296
145
316
my $pattern1 = '(.+)=(.+)';
1297
145
306
my $pattern2 = '(.+)<(.+)';
1298
145
293
my $pattern3 = '(.+)>(.+)';
1299
145
320
my @true_numeric_types = ();
1300
145
312
my @true_numeric_types_feature_names = ();
1301
145
317
my @symbolic_types = ();
1302
145
317
my @symbolic_types_feature_names = ();
1303
145
371
foreach my $item (@array_of_features_and_values_or_thresholds) {
1304
437
50
1933
if ($item =~ /$pattern2/) {
50
1305
0
0
push @true_numeric_types, $item;
1306
0
0
my ($feature,$value) = ($1,$2);
1307
0
0
push @true_numeric_types_feature_names, $feature;
1308
} elsif ($item =~ /$pattern3/) {
1309
0
0
push @true_numeric_types, $item;
1310
0
0
my ($feature,$value) = ($1,$2);
1311
0
0
push @true_numeric_types_feature_names, $feature;
1312
} else {
1313
437
1003
push @symbolic_types, $item;
1314
437
1668
$item =~ /$pattern1/;
1315
437
1581
my ($feature,$value) = ($1,$2);
1316
437
1209
push @symbolic_types_feature_names, $feature;
1317
}
1318
}
1319
145
390
my %seen1 = ();
1320
145
0
360
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1321
145
325
my %seen2 = ();
1322
145
50
378
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
437
2337
1323
145
524
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1324
print_array_with_msg("POS: Answer returned by find_bounded: ",
1325
145
50
494
$bounded_intervals_numeric_types) if $self->{_debug2};
1326
# Calculate the upper and the lower bounds to be used when searching for the best
1327
# threshold for each of the numeric features that are in play at the current node:
1328
145
360
my (%upperbound, %lowerbound);
1329
145
318
foreach my $feature_name (@true_numeric_types_feature_names) {
1330
0
0
$upperbound{$feature_name} = undef;
1331
0
0
$lowerbound{$feature_name} = undef;
1332
}
1333
145
439
foreach my $item (@$bounded_intervals_numeric_types) {
1334
0
0
foreach my $feature_grouping (@$item) {
1335
0
0
0
if ($feature_grouping->[1] eq '>') {
1336
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337
} else {
1338
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1339
}
1340
}
1341
}
1342
145
374
foreach my $feature_name (@true_numeric_types_feature_names) {
1343
0
0
0
0
if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
0
0
0
0
0
0
0
1344
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1345
0
0
return 0;
1346
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1347
0
0
0
if (! $probability) {
1348
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1349
$upperbound{$feature_name}) -
1350
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1351
} else {
1352
$probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1353
$upperbound{$feature_name}) -
1354
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1355
}
1356
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1357
0
0
0
if (! $probability) {
1358
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1359
0
0
$upperbound{$feature_name});
1360
} else {
1361
$probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1362
0
0
$upperbound{$feature_name});
1363
}
1364
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1365
0
0
0
if (! $probability) {
1366
$probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1367
0
0
$lowerbound{$feature_name});
1368
} else {
1369
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1370
0
0
$lowerbound{$feature_name}));
1371
}
1372
} else {
1373
0
0
die("Ill formatted call to 'probability_of_sequence' method");
1374
}
1375
}
1376
145
367
foreach my $feature_and_value (@symbolic_types) {
1377
437
50
2771
if ($feature_and_value =~ /$pattern1/) {
1378
437
1751
my ($feature,$value) = ($1,$2);
1379
437
100
1326
if (! $probability) {
1380
145
501
$probability = $self->probability_of_feature_value($feature, $value);
1381
} else {
1382
292
837
$probability *= $self->probability_of_feature_value($feature, $value);
1383
}
1384
}
1385
}
1386
145
606
$self->{_probability_cache}->{$sequence} = $probability;
1387
145
746
return $probability;
1388
}
1389
1390
## The following method requires that all truly numeric types only be expressed as
1391
## '<' or '>' constructs in the array of branch features and thresholds
1392
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1393
318
318
0
665
my $self = shift;
1394
318
592
my $arr = shift;
1395
318
723
my $class_name = shift;
1396
318
901
my @array_of_features_and_values_or_thresholds = @$arr;
1397
318
50
984
return if scalar @array_of_features_and_values_or_thresholds == 0;
1398
318
958
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1399
318
870
my $sequence_with_class = "$sequence" . "::" . $class_name;
1400
return $self->{_probability_cache}->{$sequence_with_class}
1401
318
100
1203
if exists $self->{_probability_cache}->{$sequence_with_class};
1402
298
618
my $probability = undef;
1403
298
606
my $pattern1 = '(.+)=(.+)';
1404
298
587
my $pattern2 = '(.+)<(.+)';
1405
298
550
my $pattern3 = '(.+)>(.+)';
1406
298
660
my @true_numeric_types = ();
1407
298
593
my @true_numeric_types_feature_names = ();
1408
298
652
my @symbolic_types = ();
1409
298
625
my @symbolic_types_feature_names = ();
1410
298
657
foreach my $item (@array_of_features_and_values_or_thresholds) {
1411
882
50
3740
if ($item =~ /$pattern2/) {
50
1412
0
0
push @true_numeric_types, $item;
1413
0
0
my ($feature,$value) = ($1,$2);
1414
0
0
push @true_numeric_types_feature_names, $feature;
1415
} elsif ($item =~ /$pattern3/) {
1416
0
0
push @true_numeric_types, $item;
1417
0
0
my ($feature,$value) = ($1,$2);
1418
0
0
push @true_numeric_types_feature_names, $feature;
1419
} else {
1420
882
2021
push @symbolic_types, $item;
1421
882
3527
$item =~ /$pattern1/;
1422
882
3261
my ($feature,$value) = ($1,$2);
1423
882
2508
push @symbolic_types_feature_names, $feature;
1424
}
1425
}
1426
298
808
my %seen1 = ();
1427
298
0
717
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1428
298
621
my %seen2 = ();
1429
298
50
772
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
882
4830
1430
298
1110
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1431
print_array_with_msg("POSC: Answer returned by find_bounded: ",
1432
298
50
1013
$bounded_intervals_numeric_types) if $self->{_debug2};
1433
# Calculate the upper and the lower bounds to be used when searching for the best
1434
# threshold for each of the numeric features that are in play at the current node:
1435
298
629
my (%upperbound, %lowerbound);
1436
298
645
foreach my $feature_name (@true_numeric_types_feature_names) {
1437
0
0
$upperbound{$feature_name} = undef;
1438
0
0
$lowerbound{$feature_name} = undef;
1439
}
1440
298
676
foreach my $item (@$bounded_intervals_numeric_types) {
1441
0
0
foreach my $feature_grouping (@$item) {
1442
0
0
0
if ($feature_grouping->[1] eq '>') {
1443
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444
} else {
1445
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1446
}
1447
}
1448
}
1449
298
689
foreach my $feature_name (@true_numeric_types_feature_names) {
1450
0
0
0
0
if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
0
0
0
0
0
0
0
1451
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1452
0
0
return 0;
1453
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1454
0
0
0
if (! $probability) {
1455
1456
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457
$upperbound{$feature_name}, $class_name) -
1458
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1459
0
0
$lowerbound{$feature_name}, $class_name);
1460
} else {
1461
$probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462
$upperbound{$feature_name}, $class_name) -
1463
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1464
0
0
$lowerbound{$feature_name}, $class_name))
1465
}
1466
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1467
0
0
0
if (! $probability) {
1468
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1469
0
0
$upperbound{$feature_name}, $class_name);
1470
} else {
1471
$probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1472
0
0
$upperbound{$feature_name}, $class_name);
1473
}
1474
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1475
0
0
0
if (! $probability) {
1476
$probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1477
0
0
$lowerbound{$feature_name}, $class_name);
1478
} else {
1479
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1480
0
0
$lowerbound{$feature_name}, $class_name));
1481
}
1482
} else {
1483
0
0
die("Ill formatted call to 'probability of sequence given class' method");
1484
}
1485
}
1486
298
651
foreach my $feature_and_value (@symbolic_types) {
1487
882
50
5684
if ($feature_and_value =~ /$pattern1/) {
1488
882
3452
my ($feature,$value) = ($1,$2);
1489
882
100
2372
if (! $probability) {
1490
298
1090
$probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491
} else {
1492
584
1678
$probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1493
}
1494
}
1495
}
1496
298
1302
$self->{_probability_cache}->{$sequence_with_class} = $probability;
1497
298
1690
return $probability;
1498
}
1499
1500
sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1501
494
494
0
1036
my $self = shift;
1502
494
957
my $class_name = shift;
1503
494
889
my $arr = shift;
1504
494
1370
my @array_of_features_and_values_or_thresholds = @$arr;
1505
494
1349
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1506
494
1298
my $class_and_sequence = "$class_name" . "::" . $sequence;
1507
return $self->{_probability_cache}->{$class_and_sequence}
1508
494
100
2370
if exists $self->{_probability_cache}->{$class_and_sequence};
1509
159
385
my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
159
573
1510
159
391
foreach my $i (0..@{$self->{_class_names}}-1) {
159
501
1511
318
761
my $class_name = $self->{_class_names}->[$i];
1512
318
1356
my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1513
\@array_of_features_and_values_or_thresholds, $class_name);
1514
318
50
1127
if ($prob < 0.000001) {
1515
0
0
$array_of_class_probabilities[$i] = 0.0;
1516
0
0
next;
1517
}
1518
318
1122
my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1519
\@array_of_features_and_values_or_thresholds);
1520
# die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1521
# "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1522
# if ! $prob_of_feature_sequence;
1523
318
1094
my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1524
318
50
822
if ($prob_of_feature_sequence) {
1525
318
1220
$array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1526
} else {
1527
0
0
$array_of_class_probabilities[$i] = $prior;
1528
}
1529
}
1530
159
363
my $sum_probability;
1531
159
433
map {$sum_probability += $_} @array_of_class_probabilities;
318
931
1532
159
50
491
if ($sum_probability == 0) {
1533
0
0
@array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
0
0
1534
0
0
(0..@{$self->{_class_names}}-1);
0
0
1535
} else {
1536
159
392
@array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
318
1035
1537
}
1538
159
376
foreach my $i (0..@{$self->{_class_names}}-1) {
159
571
1539
318
1161
my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1540
318
1501
$self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1541
}
1542
159
636
return $self->{_probability_cache}->{$class_and_sequence};
1543
}
1544
1545
####################################### Class Based Utilities ##########################################
1546
1547
## Given a list of branch attributes for the numeric features of the form, say,
1548
## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1549
## smallest list that is relevant for the purpose of calculating the probabilities.
1550
## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1551
## time, less than 2, AND, at the same time, less than 3, is the same as the
1552
## probability that the feature is less than 1. Similarly, the probability that 'age'
1553
## is greater than 34 and also greater than 37 is the same as `age' being greater
1554
## than 37.
1555
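## Illustration of the reduction described above, using the same illustrative
## feature names. For the input
##
##     ['g2<1', 'g2<2', 'g2<3', 'age>34', 'age>36', 'age>37']
##
## only 'g2<1' and 'age>37' matter, and the method returns them grouped per
## feature as [feature, comparison_op, threshold] triples, roughly:
##
##     [ [ ['g2', '<', '1'] ],  [ ['age', '>', '37'] ] ]
##
## (the thresholds come back as strings because they are split out of the
## original 'feature<value' strings).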
sub find_bounded_intervals_for_numeric_features {
1556
489
489
0
1104
my $self = shift;
1557
489
912
my $arr = shift;
1558
489
1061
my @arr = @$arr;
1559
489
917
my @features = @{$self->{_feature_names}};
489
1651
1560
489
1170
my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
0
0
0
0
1561
489
50
1456
print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1562
489
1027
my @arr3 = ();
1563
489
1133
foreach my $feature_name (@features) {
1564
1956
3893
my @temp = ();
1565
1956
3831
foreach my $x (@arr1) {
1566
0
0
0
0
push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1567
}
1568
1956
50
5964
push @arr3, \@temp if @temp > 0;
1569
}
1570
489
50
1444
print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1571
# Sort each list so that '<' entries occur before '>' entries:
1572
489
915
my @arr4;
1573
489
992
foreach my $li (@arr3) {
1574
0
0
my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
0
0
1575
0
0
push @arr4, \@sorted;
1576
}
1577
489
50
1387
print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1578
489
904
my @arr5;
1579
489
1030
foreach my $li (@arr4) {
1580
0
0
my @temp1 = ();
1581
0
0
my @temp2 = ();
1582
0
0
foreach my $inner (@$li) {
1583
0
0
0
if ($inner->[1] eq '<') {
1584
0
0
push @temp1, $inner;
1585
} else {
1586
0
0
push @temp2, $inner;
1587
}
1588
}
1589
0
0
0
0
if (@temp1 > 0 && @temp2 > 0) {
0
1590
0
0
push @arr5, [\@temp1, \@temp2];
1591
} elsif (@temp1 > 0) {
1592
0
0
push @arr5, [\@temp1];
1593
} else {
1594
0
0
push @arr5, [\@temp2];
1595
}
1596
}
1597
489
50
1396
print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1598
489
1069
my @arr6 = ();
1599
489
1036
foreach my $li (@arr5) {
1600
0
0
my @temp1 = ();
1601
0
0
foreach my $inner (@$li) {
1602
0
0
my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
0
0
1603
0
0
push @temp1, \@sorted;
1604
}
1605
0
0
push @arr6, \@temp1;
1606
}
1607
489
50
1301
print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1608
489
1034
my @arr9 = ();
1609
489
1042
foreach my $li (@arr6) {
1610
0
0
foreach my $alist (@$li) {
1611
0
0
my @newalist = ();
1612
0
0
0
if ($alist->[0][1] eq '<') {
1613
0
0
push @newalist, $alist->[0];
1614
} else {
1615
0
0
push @newalist, $alist->[-1];
1616
}
1617
0
0
0
if ($alist->[0][1] ne $alist->[-1][1]) {
1618
0
0
push @newalist, $alist->[-1];
1619
}
1620
0
0
push @arr9, \@newalist;
1621
}
1622
}
1623
489
50
1284
print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1624
489
1787
return \@arr9;
1625
1626
}
1627
1628
## This method is used to verify that you used legal feature names in the test
1629
## sample that you want to classify with the decision tree.
1630
sub check_names_used {
1631
0
0
0
0
my $self = shift;
1632
0
0
my $features_and_values_test_data = shift;
1633
0
0
my @features_and_values_test_data = @$features_and_values_test_data;
1634
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
1635
0
0
foreach my $feature_and_value (@features_and_values_test_data) {
1636
0
0
$feature_and_value =~ /$pattern/;
1637
0
0
my ($feature,$value) = ($1,$2);
1638
0
0
0
0
die "Your test data has formatting error" unless defined($feature) && defined($value);
1639
0
0
0
return 0 unless contained_in($feature, @{$self->{_feature_names}});
0
0
1640
}
1641
0
0
return 1;
1642
}
1643
1644
####################################### Data Condition Calculator ######################################
1645
1646
## This method estimates the worst-case fan-out of the decision tree taking into
1647
## account the number of values (and therefore the number of branches emanating from
1648
## a node) for the symbolic features.
1649
sub determine_data_condition {
1650
0
0
0
0
my $self = shift;
1651
0
0
my $num_of_features = scalar @{$self->{_feature_names}};
0
0
1652
0
0
my @values = ();
1653
0
0
my @number_of_values;
1654
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
1655
0
0
push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1656
0
0
0
if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
0
0
1657
0
0
push @number_of_values, scalar @values;
1658
}
1659
0
0
0
return if ! @values;
1660
0
0
print "Number of features: $num_of_features\n";
1661
0
0
my @minmax = minmax(\@number_of_values);
1662
0
0
my $max_num_values = $minmax[1];
1663
0
0
print "Largest number of values for symbolic features is: $max_num_values\n";
1664
0
0
my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1665
0
0
print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1666
"nodes. The exact number of nodes created depends critically on " .
1667
"the entropy_threshold used for node expansion (the default value " .
1668
"for this threshold is 0.01) and on the value set for max_depth_desired " .
1669
"for the depth of the tree.\n";
1670
0
0
0
if ($estimated_number_of_nodes > 10000) {
1671
0
0
print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1672
"large value for entropy_threshold and/or a small value for " .
1673
"for max_depth_desired to reduce the number of nodes created.\n";
1674
0
0
print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1675
0
0
my $answer = <STDIN>;
1676
0
0
$answer =~ s/\r?\n?$//;
1677
0
0
0
while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1678
0
0
print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1679
0
0
$answer = <STDIN>;
1680
0
0
$answer =~ s/\r?\n?$//;
1681
}
1682
0
0
0
die unless $answer =~ /y(es)?/i;
1683
}
1684
}
1685
1686
1687
#################################### Read Training Data From File ######################################
1688
1689
1690
sub get_training_data {
1691
1
1
1
7
my $self = shift;
1692
1
50
10
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1693
1
4
my %class_names = ();
1694
1
3
my %all_record_ids_with_class_labels;
1695
my $firstline;
1696
1
0
my %data_hash;
1697
1
4
$|++;
1698
1
50
25
open FILEIN, $self->{_training_datafile} or die "unable to open $self->{_training_datafile}: $!";
1699
1
4
my $record_index = 0;
1700
1
2
1701
1
16
while (<FILEIN>) {
1702
36
50
115
next if /^[ ]*\r?\n?$/;
1703
36
112
$_ =~ s/\r?\n?$//;
1704
36
50
84
my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
1705
36
100
77
if ($record_index == 0) {
1706
1
2
$firstline = $record;
1707
1
3
$record_index++;
1708
1
11
next;
1709
}
1710
35
152
my @parts = split /,/, $record;
1711
35
66
my $classname = $parts[$self->{_csv_class_column_index}];
1712
35
49
$class_names{$classname} = 1;
1713
35
56
my $record_label = shift @parts;
1714
35
81
$record_label =~ s/^\s*\"|\"\s*$//g;
1715
35
77
$data_hash{$record_label} = \@parts;
1716
35
59
$all_record_ids_with_class_labels{$record_label} = $classname;
1717
35
50
76
print "." if $record_index % 10000 == 0;
1718
35
108
$record_index++;
1719
}
1720
1
9
close FILEIN;
1721
1
3
$|--;
1722
1
5
$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1723
1
50
4
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1724
1
11
my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1725
1
4
my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1726
1
4
my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
2
12
1727
1
3
my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
4
9
1
3
1728
1
5
my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
35
96
1729
1
10
my @sample_names = map {"sample_$_"} keys %data_hash;
35
59
1730
1
50
8
my %feature_values_for_samples_hash = map {
    my $sampleID = $_;
    "sample_" . $sampleID => [ map {
        my $fname = $all_feature_names[$_];
        $fname . "=" . eval { $data_hash{$sampleID}->[$_-1] =~ /^\d+$/
                                  ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1])
                                  : $data_hash{$sampleID}->[$_-1] }
    } @{$self->{_csv_columns_for_features}} ]
} keys %data_hash;
35
59
35
55
140
220
140
223
140
516
35
58
1731
1
50
10
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
4
15
4
11
4
31
140
300
140
518
140
387
1
5
1732
1
4
my %numeric_features_valuerange_hash = ();
1733
1
3
my %feature_values_how_many_uniques_hash = ();
1734
1
4
my %features_and_unique_values_hash = ();
1735
1
3
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1736
1
5
foreach my $feature (keys %features_and_values_hash) {
1737
4
9
my %seen1 = ();
1738
140
100
66
655
my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1739
4
8
@{$features_and_values_hash{$feature}};
4
16
1740
4
15
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1741
4
11
my $not_all_values_float = 0;
1742
4
50
11
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
14
152
1743
4
50
13
if ($not_all_values_float == 0) {
1744
0
0
my @minmaxvalues = minmax(\@unique_values_for_feature);
1745
0
0
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1746
}
1747
4
17
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1748
}
1749
1
50
7
if ($self->{_debug1}) {
1750
0
0
print "\nAll class names: @all_class_names\n";
1751
0
0
print "\nEach sample data record:\n";
1752
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
0
0
1753
0
0
print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
0
0
1754
}
1755
0
0
print "\nclass label for each data sample:\n";
1756
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
0
0
1757
0
0
print "$sample => $class_for_sample_hash{$sample}\n";
1758
}
1759
0
0
print "\nFeatures used: @feature_names\n\n";
1760
0
0
print "\nfeatures and the values taken by them:\n";
1761
0
0
foreach my $feature (sort keys %features_and_values_hash) {
1762
0
0
print "$feature => @{$features_and_values_hash{$feature}}\n";
0
0
1763
}
1764
0
0
print "\nnumeric features and their ranges:\n";
1765
0
0
foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1766
0
0
print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
0
0
1767
}
1768
0
0
print "\nnumber of unique values in each feature:\n";
1769
0
0
foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1770
0
0
print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1771
}
1772
}
1773
1
4
$self->{_class_names} = \@all_class_names;
1774
1
4
$self->{_feature_names} = \@feature_names;
1775
1
4
$self->{_samples_class_label_hash} = \%class_for_sample_hash;
1776
1
3
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
1777
1
4
$self->{_features_and_values_hash} = \%features_and_values_hash;
1778
1
12
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1779
1
3
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1780
1
44
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1781
}
1782
1783
sub show_training_data {
1784
0
0
1
0
my $self = shift;
1785
0
0
my @class_names = @{$self->{_class_names}};
0
0
1786
0
0
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
0
0
1787
0
0
my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
0
0
1788
0
0
my %training_data_hash = %{$self->{_training_data_hash}};
0
0
1789
0
0
print "\n\nClass Names: @class_names\n";
1790
0
0
print "\n\nFeatures and Their Values:\n\n";
1791
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
1792
0
0
print "$k ---> @{$features_and_values_hash{$k}}\n";
0
0
1793
}
1794
0
0
print "\n\nSamples vs. Class Labels:\n\n";
1795
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
0
0
1796
0
0
print "$kee => $samples_class_label_hash{$kee}\n";
1797
}
1798
0
0
print "\n\nTraining Samples:\n\n";
1799
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
0
0
1800
keys %training_data_hash) {
1801
0
0
print "$kee => @{$training_data_hash{$kee}}\n";
0
0
1802
}
1803
}
1804
1805
sub get_class_names {
1806
0
0
0
0
my $self = shift;
1807
0
0
return @{$self->{_class_names}}
0
0
1808
}
1809
1810
########################################## Utility Routines ############################################
1811
1812
sub closest_sampling_point {
1813
0
0
0
0
my $value = shift;
1814
0
0
my $arr_ref = shift;
1815
0
0
my @arr = @{$arr_ref};
0
0
1816
0
0
my @compare = map {abs($_ - $value)} @arr;
0
0
1817
0
0
my ($minval,$index) = minimum(\@compare);
1818
0
0
return $arr[$index];
1819
}
1820
1821
## returns the array index that contains a specified STRING value: (meant only for array of strings)
1822
sub get_index_at_value {
1823
0
0
0
0
my $value = shift;
1824
0
0
my @array = @{shift @_};
0
0
1825
0
0
foreach my $i (0..@array-1) {
1826
0
0
0
return $i if $value eq $array[$i];
1827
}
1828
}
1829
1830
## When the training data is read from a CSV file, we assume that the first column
1831
## of each data record contains a unique integer identifier for the record in that
1832
## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1833
## followed by the identifying integers. The purpose of this function is to return
1834
## the identifying integer associated with a data record.
1835
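## For example, sample_index("sample_23") returns "23", which is what lets the
## sort calls elsewhere in this file order sample names numerically with <=>.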
sub sample_index {
1836
1088
1088
0
1757
my $arg = shift;
1837
1088
2447
$arg =~ /_(.+)$/;
1838
1088
2685
return $1;
1839
}
1840
1841
# Returns the minimum value and its positional index in an array
1842
sub minimum {
1843
0
0
0
0
my $arr = shift;
1844
0
0
my $min;
1845
my $index;
1846
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1847
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1848
0
0
$index = $i;
1849
0
0
$min = $arr->[$i];
1850
}
1851
}
1852
0
0
return ($min, $index);
1853
}
1854
1855
# Returns an array of two values, the min and the max, of an array of floats
1856
sub minmax {
1857
0
0
0
0
my $arr = shift;
1858
0
0
my ($min, $max);
1859
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1860
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1861
0
0
$min = $arr->[$i];
1862
}
1863
0
0
0
0
if ( (!defined $max) || ($arr->[$i] > $max) ) {
1864
0
0
$max = $arr->[$i];
1865
}
1866
}
1867
0
0
return ($min, $max);
1868
}
1869
1870
# checks whether an element is in an array:
1871
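## Note that the value returned is a count of matches rather than a plain
## boolean, e.g. contained_in('b', 'a', 'b', 'c') returns 1 while
## contained_in('z', 'a', 'b', 'c') returns 0.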
sub contained_in {
1872
414
414
0
858
my $ele = shift;
1873
414
1227
my @array = @_;
1874
414
858
my $count = 0;
1875
414
100
908
map {$count++ if $ele eq $_} @array;
1104
3997
1876
414
1727
return $count;
1877
}
1878
1879
# Meant only for an array of strings (no nesting):
1880
sub deep_copy_array {
1881
289
289
0
619
my $ref_in = shift;
1882
289
578
my $ref_out;
1883
289
100
885
return [] if scalar @$ref_in == 0;
1884
274
525
foreach my $i (0..@{$ref_in}-1) {
274
994
1885
628
1783
$ref_out->[$i] = $ref_in->[$i];
1886
}
1887
274
986
return $ref_out;
1888
}
1889
1890
sub check_for_illegal_params2 {
1891
1
1
0
3
my @params = @_;
1892
1
7
my @legal_params = qw / training_datafile
1893
entropy_threshold
1894
max_depth_desired
1895
csv_class_column_index
1896
csv_columns_for_features
1897
symbolic_to_numeric_cardinality_threshold
1898
number_of_histogram_bins
1899
csv_cleanup_needed
1900
debug1
1901
debug2
1902
debug3
1903
/;
1904
1
7
my $found_match_flag;
1905
1
3
foreach my $param (@params) {
1906
3
5
foreach my $legal (@legal_params) {
1907
10
16
$found_match_flag = 0;
1908
10
100
20
if ($param eq $legal) {
1909
3
4
$found_match_flag = 1;
1910
3
5
last;
1911
}
1912
}
1913
3
50
9
last if $found_match_flag == 0;
1914
}
1915
1
9
return $found_match_flag;
1916
}
1917
1918
sub print_array_with_msg {
1919
0
0
0
0
my $message = shift;
1920
0
0
my $arr = shift;
1921
0
0
print "\n$message: ";
1922
0
0
print_nested_array( $arr );
1923
}
1924
1925
sub print_nested_array {
1926
0
0
0
0
my $arr = shift;
1927
0
0
my @arr = @$arr;
1928
0
0
print "[";
1929
0
0
foreach my $item (@arr) {
1930
0
0
0
if (ref $item) {
1931
0
0
print_nested_array($item);
1932
} else {
1933
0
0
print "$item";
1934
}
1935
}
1936
0
0
print "]";
1937
}
1938
1939
sub cleanup_csv {
1940
0
0
0
0
my $line = shift;
1941
0
0
$line =~ tr/\/:?()[]{}'/ /;
1942
# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1943
0
0
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
1944
0
0
for (@double_quoted) {
1945
0
0
my $item = $_;
1946
0
0
$item = substr($item, 1, -1);
1947
0
0
$item =~ s/^\s+|,|\s+$//g;
1948
0
0
$item = join '_', split /\s+/, $item;
1949
0
0
substr($line, index($line, $_), length($_)) = $item;
1950
}
1951
0
0
my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
1952
0
0
for (@white_spaced) {
1953
0
0
my $item = $_;
1954
0
0
$item =~ s/\s+/_/g;
1955
0
0
$item =~ s/^\s*_|_\s*$//g;
1956
0
0
substr($line, index($line, $_), length($_)) = $item;
1957
}
1958
0
0
$line =~ s/,\s*(?=,|$)/,NA/g;
1959
0
0
return $line;
1960
}
1961
1962
######################################### Class EvalTrainingData ########################################
1963
1964
## This subclass of the DecisionTree class is used to evaluate the quality of your
1965
## training data by running a 10-fold cross-validation test on it. This test divides
1966
## all of the training data into ten parts, with nine parts used for training a
1967
## decision tree and one part used for testing its ability to classify correctly.
1968
## This selection of nine parts for training and one part for testing is carried out
1969
## in all of the ten different possible ways. This testing functionality can also
1970
## be used to find the best values to use for the constructor parameters
1971
## entropy_threshold, max_depth_desired, and
1972
## symbolic_to_numeric_cardinality_threshold.
1973
1974
## Only the CSV training files can be evaluated in this manner (because only CSV
1975
## training are allowed to have numeric features --- which is the more interesting
1976
## case for evaluation analytics.
1977
1978
package EvalTrainingData;
1979
1980
@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1981
1982
sub new {
1983
0
0
0
my $class = shift;
1984
0
0
my $instance = Algorithm::DecisionTree->new(@_);
1985
0
0
bless $instance, $class;
1986
}
1987
1988
sub evaluate_training_data {
1989
0
0
0
my $self = shift;
1990
0
0
my $evaldebug = 0;
1991
die "The data evaluation function in the module can only be used when your " .
1992
0
0
0
"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1993
0
0
print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1994
"class-discriminatory power:\n";
1995
0
0
my %all_training_data = %{$self->{_training_data_hash}};
0
0
1996
0
0
my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
1997
Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
1998
0
0
my $fold_size = int(0.1 * (scalar keys %all_training_data));
1999
0
0
print "fold size: $fold_size\n";
2000
0
0
my %confusion_matrix = ();
2001
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2002
0
0
foreach my $inner_class_name (@{$self->{_class_names}}) {
0
0
2003
0
0
$confusion_matrix{$class_name}->{$inner_class_name} = 0;
2004
}
2005
}
2006
0
0
foreach my $fold_index (0..9) {
2007
0
0
print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2008
0
0
my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2009
0
0
my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2010
@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2011
0
0
my %testing_data = ();
2012
0
0
foreach my $x (@testing_samples) {
2013
0
0
$testing_data{$x} = $all_training_data{$x};
2014
}
2015
0
0
my %training_data = ();
2016
0
0
foreach my $x (@training_samples) {
2017
0
0
$training_data{$x} = $all_training_data{$x};
2018
}
2019
0
0
my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2020
0
0
$trainingDT->{_training_data_hash} = \%training_data;
2021
0
0
$trainingDT->{_class_names} = $self->{_class_names};
2022
0
0
$trainingDT->{_feature_names} = $self->{_feature_names};
2023
0
0
$trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2024
0
0
$trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2025
$trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2026
0
0
$self->{_symbolic_to_numeric_cardinality_threshold};
2027
0
0
foreach my $sample_name (@training_samples) {
2028
$trainingDT->{_samples_class_label_hash}->{$sample_name} =
2029
0
0
$self->{_samples_class_label_hash}->{$sample_name};
2030
}
2031
0
0
foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
0
0
2032
0
0
$trainingDT->{_features_and_values_hash}->{$feature} = ();
2033
}
2034
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
2035
0
0
foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
2036
Algorithm::DecisionTree::sample_index($b)}
2037
0
0
keys %{$trainingDT->{_training_data_hash}}) {
2038
0
0
foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
0
0
2039
0
0
$feature_and_value =~ /$pattern/;
2040
0
0
my ($feature,$value) = ($1,$2);
2041
0
0
0
push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
0
0
2042
}
2043
}
2044
0
0
foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2045
0
0
my %seen = ();
2046
0
0
0
0
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2047
0
0
@{$trainingDT->{_features_and_values_hash}->{$feature}};
0
0
2048
0
0
0
if (Algorithm::DecisionTree::contained_in($feature,
2049
0
0
keys %{$self->{_numeric_features_valuerange_hash}})) {
2050
0
0
@unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
0
0
2051
} else {
2052
0
0
@unique_values_for_feature = sort @unique_values_for_feature;
2053
}
2054
0
0
$trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2055
}
2056
0
0
foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
0
0
2057
my @minmaxvalues = Algorithm::DecisionTree::minmax(
2058
0
0
\@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
0
0
2059
0
0
$trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2060
}
2061
0
0
0
if ($evaldebug) {
2062
0
0
print "\n\nprinting samples in the testing set: @testing_samples\n";
2063
0
0
print "\n\nPrinting features and their values in the training set:\n";
2064
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2065
0
0
print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
0
0
2066
}
2067
0
0
print "\n\nPrinting unique values for features:\n";
2068
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
0
0
2069
0
0
print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
0
0
2070
}
2071
0
0
print "\n\nPrinting unique value ranges for features:\n";
2072
0
0
foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
0
0
2073
0
0
print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
0
0
2074
}
2075
}
2076
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
2077
$trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2078
0
0
scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
0
0
2079
}
2080
0
0
0
$trainingDT->{_debug2} = 1 if $evaldebug;
2081
0
0
$trainingDT->calculate_first_order_probabilities();
2082
0
0
$trainingDT->calculate_class_priors();
2083
0
0
my $root_node = $trainingDT->construct_decision_tree_classifier();
2084
0
0
0
$root_node->display_decision_tree(" ") if $evaldebug;
2085
0
0
foreach my $test_sample_name (@testing_samples) {
2086
0
0
my @test_sample_data = @{$all_training_data{$test_sample_name}};
0
0
2087
0
0
0
print "original data in test sample: @test_sample_data\n" if $evaldebug;
2088
0
0
0
0
@test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
0
0
2089
0
0
0
print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2090
0
0
my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
0
0
2091
0
0
my @solution_path = @{$classification{'solution_path'}};
0
0
2092
0
0
delete $classification{'solution_path'};
2093
0
0
my @which_classes = keys %classification;
2094
0
0
@which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
0
0
2095
0
0
my $most_likely_class_label = $which_classes[0];
2096
0
0
0
if ($evaldebug) {
2097
0
0
print "\nClassification:\n\n";
2098
0
0
print " class probability\n";
2099
0
0
print " ---------- -----------\n";
2100
0
0
foreach my $which_class (@which_classes) {
2101
0
0
my $classstring = sprintf("%-30s", $which_class);
2102
0
0
my $valuestring = sprintf("%-30s", $classification{$which_class});
2103
0
0
print " $classstring $valuestring\n";
2104
}
2105
0
0
print "\nSolution path in the decision tree: @solution_path\n";
2106
0
0
print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2107
}
2108
0
0
my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2109
0
0
0
print "$test_sample_name: true_class: $true_class_label_for_sample " .
2110
"estimated_class: $most_likely_class_label\n" if $evaldebug;
2111
0
0
$confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2112
}
2113
}
2114
0
0
print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2115
0
0
my $matrix_header = " " x 30;
2116
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2117
0
0
$matrix_header .= sprintf("%-30s", $class_name);
2118
}
2119
0
0
print "\n" . $matrix_header . "\n\n";
2120
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2121
0
0
my $row_display = sprintf("%-30s", $row_class_name);
2122
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2123
0
0
$row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2124
}
2125
0
0
print "$row_display\n\n";
2126
}
2127
0
0
print "\n\n";
2128
0
0
my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2129
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2130
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2131
0
0
0
if ($row_class_name eq $col_class_name) {
2132
0
0
$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2133
} else {
2134
0
0
$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2135
}
2136
}
2137
}
2138
0
0
my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2139
0
0
print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2140
0
0
0
0
if ($data_quality_index <= 80) {
0
0
0
0
0
2141
0
0
print "\nYour training data does not possess much class discriminatory " .
2142
"information. It could be that the classes are inherently not well " .
2143
"separable or that your constructor parameter choices are not appropriate.\n";
2144
} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2145
0
0
print "\nYour training data possesses some class discriminatory information " .
2146
"but it may not be sufficient for real-world applications. You might " .
2147
"try tweaking the constructor parameters to see if that improves the " .
2148
"class discriminations.\n";
2149
} elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2150
0
0
print "\nYour training data appears to possess good class discriminatory " .
2151
"information. Whether or not it is acceptable would depend on your " .
2152
"application.\n";
2153
} elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2154
0
0
print "\nYour training data is of excellent quality.\n";
2155
} else {
2156
0
0
print "\nYour training data is perfect.\n";
2157
}
2158
2159
}
2160
2161
2162
############################################# Class DTNode #############################################
2163
2164
# The nodes of the decision tree are instances of this class:
2165
2166
package DTNode;
2167
2168
1
1
23
use strict;
1
5
1
39
2169
1
1
14
use Carp;
1
4
1
1578
2170
2171
# $feature is the feature test at the current node. $branch_features_and_values is
2172
# an anonymous array holding the feature names and corresponding values on the path
2173
# from the root to the current node:
2174
sub new {
2175
46
46
235
my ($class, $feature, $entropy, $class_probabilities,
2176
$branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2177
46
100
229
$root_or_not = '' if !defined $root_or_not;
2178
46
100
187
if ($root_or_not eq 'root') {
2179
1
5
$dt->{nodes_created} = -1;
2180
1
8
$dt->{class_names} = undef;
2181
}
2182
46
400
my $self = {
2183
_dt => $dt,
2184
_feature => $feature,
2185
_node_creation_entropy => $entropy,
2186
_class_probabilities => $class_probabilities,
2187
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2188
_linked_to => [],
2189
};
2190
46
153
bless $self, $class;
2191
46
199
$self->{_serial_number} = $self->get_next_serial_num();
2192
46
136
return $self;
2193
}
2194
2195
sub how_many_nodes {
2196
0
0
0
my $self = shift;
2197
0
0
return $self->{_dt}->{nodes_created} + 1;
2198
}
2199
2200
sub set_class_names {
2201
1
1
5
my $self = shift;
2202
1
4
my $class_names_list = shift;
2203
1
4
$self->{_dt}->{class_names} = $class_names_list;
2204
}
2205
2206
sub get_class_names {
2207
0
0
0
my $self = shift;
2208
0
0
return $self->{_dt}->{class_names};
2209
}
2210
2211
sub get_next_serial_num {
2212
46
46
110
my $self = shift;
2213
46
158
$self->{_dt}->{nodes_created} += 1;
2214
46
154
return $self->{_dt}->{nodes_created};
2215
}
2216
2217
sub get_serial_num {
2218
46
46
115
my $self = shift;
2219
46
135
$self->{_serial_number};
2220
}
2221
2222
# this returns the feature test at the current node
2223
sub get_feature {
2224
0
0
0
my $self = shift;
2225
0
0
return $self->{ _feature };
2226
}
2227
2228
sub set_feature {
2229
46
46
114
my $self = shift;
2230
46
105
my $feature = shift;
2231
46
168
$self->{_feature} = $feature;
2232
}
2233
2234
sub get_node_entropy {
2235
46
46
103
my $self = shift;
2236
46
132
return $self->{_node_creation_entropy};
2237
}
2238
2239
sub get_class_probabilities {
2240
0
0
0
my $self = shift;
2241
0
0
return $self->{ _class_probabilities};
2242
}
2243
2244
sub get_branch_features_and_values_or_thresholds {
2245
46
46
117
my $self = shift;
2246
46
178
return $self->{_branch_features_and_values_or_thresholds};
2247
}
2248
2249
sub add_to_branch_features_and_values {
2250
0
0
0
my $self = shift;
2251
0
0
my $feature_and_value = shift;
2252
0
0
push @{$self->{ _branch_features_and_values }}, $feature_and_value;
0
0
2253
}
2254
2255
sub get_children {
2256
0
0
0
my $self = shift;
2257
0
0
return $self->{_linked_to};
2258
}
2259
2260
sub add_child_link {
2261
45
45
136
my ($self, $new_node, ) = @_;
2262
45
103
push @{$self->{_linked_to}}, $new_node;
45
151
2263
}
2264
2265
sub delete_all_links {
2266
0
0
0
my $self = shift;
2267
0
0
$self->{_linked_to} = undef;
2268
}
2269
2270
sub display_node {
2271
0
0
0
my $self = shift;
2272
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2273
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2274
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2275
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2276
0
0
my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2277
0
0
my $serial_num = $self->get_serial_num();
2278
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2279
0
0
print "\n\nNODE $serial_num" .
2280
":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2281
"\n Class probabilities at current node: @class_probabilities_for_display" .
2282
"\n Entropy at current node: $print_node_creation_entropy_at_node" .
2283
"\n Best feature test at current node: $feature_at_node\n\n";
2284
}
2285
2286
sub display_decision_tree {
    my $self = shift;
    my $offset = shift;
    my $serial_num = $self->get_serial_num();
    if (@{$self->get_children()} > 0) {
        my $feature_at_node = $self->get_feature() || " ";
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
              "$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
        $offset .= " ";
        foreach my $child (@{$self->get_children()}) {
            $child->display_decision_tree($offset);
        }
    } else {
        my $node_creation_entropy_at_node = $self->get_node_entropy();
        my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        my @class_probabilities = @{$self->get_class_probabilities()};
        my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
        my @class_names = @{$self->get_class_names()};
        my @print_class_probabilities_with_class =
            map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
        print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
              "Class Probs: @print_class_probabilities_with_class\n\n";
    }
}

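## A minimal sketch of how these display methods are typically invoked from user
## code (the offset string passed to display_decision_tree() is just an
## illustrative choice, not something mandated by the module):
##
##     $root_node->display_node();
##     $root_node->display_decision_tree("   ");
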
############################## Generate Your Own Numeric Training Data #################################
############################# Class TrainingDataGeneratorNumeric ################################

## See the script generate_training_data_numeric.pl in the examples
## directory on how to use this class for generating your own numeric training and
## test data. The training and test data are generated in accordance with the
## specifications you place in the parameter file that is supplied as an argument to
## the constructor of this class.

package TrainingDataGeneratorNumeric;

use strict;
use Carp;

sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params3(@params) == 0;
    bless {
        _output_training_csv_file        => $args{'output_training_csv_file'}
                                            || croak("name for output_training_csv_file required"),
        _output_test_csv_file            => $args{'output_test_csv_file'}
                                            || croak("name for output_test_csv_file required"),
        _parameter_file                  => $args{'parameter_file'}
                                            || croak("parameter_file required"),
        _number_of_samples_for_training  => $args{'number_of_samples_for_training'}
                                            || croak("number_of_samples_for_training required"),
        _number_of_samples_for_testing   => $args{'number_of_samples_for_testing'}
                                            || croak("number_of_samples_for_testing required"),
        _debug                           => $args{debug} || 0,
        _class_names                     => [],
        _class_names_and_priors          => {},
        _features_with_value_range       => {},
        _features_ordered                => [],
        _classes_and_their_param_values  => {},
    }, $class;
}

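## A minimal usage sketch for this class, assuming a parameter file and output CSV
## filenames of your own choosing (the filenames and sample counts shown here are
## purely illustrative, not part of the distribution):
##
##     my $generator = TrainingDataGeneratorNumeric->new(
##                         output_training_csv_file       => 'training_numeric.csv',
##                         output_test_csv_file           => 'test_numeric.csv',
##                         parameter_file                 => 'param_numeric.txt',
##                         number_of_samples_for_training => 3000,
##                         number_of_samples_for_testing  => 1000,
##                     );
##     $generator->read_parameter_file_numeric();
##     $generator->gen_numeric_training_and_test_data_and_write_to_csv();
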
sub check_for_illegal_params3 {
    my @params = @_;
    my @legal_params = qw / output_training_csv_file
                            output_test_csv_file
                            parameter_file
                            number_of_samples_for_training
                            number_of_samples_for_testing
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

## The training data generated by an instance of the class
## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file
## that you supply to the class constructor through a constructor variable called
## `parameter_file'. This method is for parsing the parameter file in order to
## determine the names to be used for the different data classes, their means, and
## their variances.
sub read_parameter_file_numeric {
    my $self = shift;
    my @class_names = ();
    my %class_names_and_priors = ();
    my %features_with_value_range = ();
    my %classes_and_their_param_values = ();
#   my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    open FILE, $self->{_parameter_file} or die "unable to open parameter file: $!";
    my @params = <FILE>;
    my $params = join "", @params;
    my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
    $params =~ /$regex/si;
    my ($class_names, $class_priors) = ($1, $2);
    @class_names = split ' ', $class_names;
    my @class_priors = split ' ', $class_priors;
    foreach my $i (0..@class_names-1) {
        $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
    }
    if ($self->{_debug}) {
        foreach my $cname (keys %class_names_and_priors) {
            print "$cname => $class_names_and_priors{$cname}\n";
        }
    }
    $regex = 'feature name: \w*.*?value range: [\d\. -]+';
    my @features = $params =~ /$regex/gsi;
    my @features_ordered;
    $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
    foreach my $feature (@features) {
        $feature =~ /$regex/i;
        my $feature_name = $1;
        push @features_ordered, $feature_name;
        my @value_range = split ' ', $2;
        $features_with_value_range{$feature_name} = \@value_range;
    }
    if ($self->{_debug}) {
        foreach my $fname (keys %features_with_value_range) {
            print "$fname => @{$features_with_value_range{$fname}}\n";
        }
    }
    foreach my $i (0..@class_names-1) {
        $classes_and_their_param_values{$class_names[$i]} = {};
    }
    $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
    my @class_params = $params =~ /$regex/gsi;
    $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
    foreach my $class_param (@class_params) {
        $class_param =~ /$regex/gsi;
        my $class_name = $1;
        my @class_mean = split ' ', $2;
        $classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
        my $class_param_string = $3;
        my @covar_rows = split '\n', $class_param_string;
        my @covar_matrix;
        foreach my $row (@covar_rows) {
            my @row = split ' ', $row;
            push @covar_matrix, \@row;
        }
        $classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
    }
    if ($self->{_debug}) {
        print "\nThe class parameters are:\n\n";
        foreach my $cname (keys %classes_and_their_param_values) {
            print "\nFor class name $cname:\n";
            my %params_hash = %{$classes_and_their_param_values{$cname}};
            foreach my $x (keys %params_hash) {
                if ($x eq 'mean') {
                    print " $x => @{$params_hash{$x}}\n";
                } else {
                    if ($x eq 'covariance') {
                        print " The covariance matrix:\n";
                        my @matrix = @{$params_hash{'covariance'}};
                        foreach my $row (@matrix) {
                            print " @$row\n";
                        }
                    }
                }
            }
        }
    }
    $self->{_class_names} = \@class_names;
    $self->{_class_names_and_priors} = \%class_names_and_priors;
    $self->{_features_with_value_range} = \%features_with_value_range;
    $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
    $self->{_features_ordered} = \@features_ordered;
}

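## To make the parsing logic above concrete, a numeric parameter file is expected to
## look roughly like the sketch below (the class names, feature names, and all of the
## numbers are made up for illustration; see the examples directory for a real file):
##
##     class names: recession goodtimes
##     class priors: 0.4 0.6
##
##     feature name: gdp
##     value range: 0 100
##     feature name: return_on_invest
##     value range: 0 100
##
##     params for class: recession
##     mean: 50.0 30.0
##     covariance:
##         1.0  0.0
##         0.0  1.0
##
##     params for class: goodtimes
##     mean: 70.0 60.0
##     covariance:
##         1.0  0.0
##         0.0  1.0
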
## After the parameter file is parsed by the previous method, this method calls on
## Math::Random::random_multivariate_normal() to generate the training and test data
## samples. Your training and test data can be of any number of dimensions, can
## have any mean, and any covariance. The training and test data must obviously be
## drawn from the same distribution.
sub gen_numeric_training_and_test_data_and_write_to_csv {
    use Math::Random;
    my $self = shift;
    my %training_samples_for_class;
    my %test_samples_for_class;
    foreach my $class_name (@{$self->{_class_names}}) {
        $training_samples_for_class{$class_name} = [];
        $test_samples_for_class{$class_name} = [];
    }
    foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
        my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
        my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
        my @new_training_data = Math::Random::random_multivariate_normal(
            $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
            @mean, @covariance );
        my @new_test_data = Math::Random::random_multivariate_normal(
            $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
            @mean, @covariance );
        if ($self->{_debug}) {
            print "training data for class $class_name:\n";
            foreach my $x (@new_training_data) {print "@$x\n";}
            print "\n\ntest data for class $class_name:\n";
            foreach my $x (@new_test_data) {print "@$x\n";}
        }
        $training_samples_for_class{$class_name} = \@new_training_data;
        $test_samples_for_class{$class_name} = \@new_test_data;
    }
    my @training_data_records = ();
    my @test_data_records = ();
    foreach my $class_name (keys %training_samples_for_class) {
        my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
                                          $self->{_class_names_and_priors}->{$class_name};
        my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
                                         $self->{_class_names_and_priors}->{$class_name};
        foreach my $sample_index (0..$num_of_samples_for_training-1) {
            my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
            @training_vector = map {sprintf("%.3f", $_)} @training_vector;
            my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
            push @training_data_records, $training_data_record;
        }
        foreach my $sample_index (0..$num_of_samples_for_testing-1) {
            my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
            @test_vector = map {sprintf("%.3f", $_)} @test_vector;
            my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
            push @test_data_records, $test_data_record;
        }
    }
    fisher_yates_shuffle(\@training_data_records);
    fisher_yates_shuffle(\@test_data_records);
    if ($self->{_debug}) {
        foreach my $record (@training_data_records) {
            print "$record";
        }
        foreach my $record (@test_data_records) {
            print "$record";
        }
    }
    open OUTPUT, ">$self->{_output_training_csv_file}";
    my @feature_names_training = @{$self->{_features_ordered}};
    my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
    my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
    print OUTPUT "$first_row_training\n";
    foreach my $i (0..@training_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$training_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
    open OUTPUT, ">$self->{_output_test_csv_file}";
    my @feature_names_testing = keys %{$self->{_features_with_value_range}};
    my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
    my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
    print OUTPUT "$first_row_testing\n";
    foreach my $i (0..@test_data_records-1) {
        my $i1 = $i+1;
        my $sample_record = "\"$i1\",$test_data_records[$i]";
        print OUTPUT "$sample_record";
    }
    close OUTPUT;
}

# from perl docs:
sub fisher_yates_shuffle {
    my $arr = shift;
    my $i = @$arr;
    while (--$i) {
        my $j = int rand( $i + 1 );
        @$arr[$i, $j] = @$arr[$j, $i];
    }
}

########################### Generate Your Own Symbolic Training Data ###############################
########################### Class TrainingDataGeneratorSymbolic #############################

## See the sample script generate_training_and_test_data_symbolic.pl for how to use
## this class for generating purely symbolic training and test data. The data is
## generated according to the specifications you place in a parameter file whose
## name you supply as one of the constructor arguments.
package TrainingDataGeneratorSymbolic;

use strict;
use Carp;

sub new {
    my ($class, %args) = @_;
    my @params = keys %args;
    croak "\nYou have used a wrong name for a keyword argument " .
          "--- perhaps a misspelling\n"
          if check_for_illegal_params4(@params) == 0;
    bless {
        _output_training_datafile        => $args{'output_training_datafile'}
                                            || die("name for output_training_datafile required"),
        _parameter_file                  => $args{'parameter_file'}
                                            || die("parameter_file required"),
        _number_of_samples_for_training  => $args{'number_of_samples_for_training'}
                                            || die("number_of_samples_for_training required"),
        _debug                           => $args{debug} || 0,
        _class_names                     => [],
        _class_priors                    => [],
        _features_and_values_hash        => {},
        _bias_hash                       => {},
        _training_sample_records         => {},
    }, $class;
}

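## A minimal usage sketch for this class (the filenames and the sample count are
## illustrative only):
##
##     my $generator = TrainingDataGeneratorSymbolic->new(
##                         output_training_datafile       => 'training_symbolic.csv',
##                         parameter_file                 => 'param_symbolic.txt',
##                         number_of_samples_for_training => 200,
##                     );
##     $generator->read_parameter_file_symbolic();
##     $generator->gen_symbolic_training_data();
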
sub check_for_illegal_params4 {
    my @params = @_;
    my @legal_params = qw / output_training_datafile
                            parameter_file
                            number_of_samples_for_training
                            debug
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

## Read a parameter file for generating symbolic training data. See the script
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
## pass the name of the parameter file to the constructor of the
## TrainingDataGeneratorSymbolic class.
sub read_parameter_file_symbolic {
    my $self = shift;
    my $debug = $self->{_debug};
    my $number_of_training_samples = $self->{_number_of_samples_for_training};
    my $input_parameter_file = $self->{_parameter_file};
    croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
    my $output_file_training = $self->{_output_training_datafile};
    my $output_file_testing = $self->{_output_test_datafile};
    my @all_params;
    my $param_string;
    open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
    @all_params = <INPUT>;
    @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
    @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
    $param_string = join ' ', @all_params;
    my ($class_names, $class_priors, $rest_param) =
        $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
    my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
    push @{$self->{_class_names}}, @class_names;
    my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
    push @{$self->{_class_priors}}, @class_priors;
    my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
    my %features_and_values_hash;
    my @features = split /(feature[:])/, $feature_string;
    @features = grep {defined($_) && length($_) > 0} @features;
    foreach my $item (@features) {
        next if $item =~ /feature/;
        my @splits = split / /, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $features_and_values_hash{$splits[0]} = [];
            } else {
                next if $splits[$i] =~ /values/;
                push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
            }
        }
    }
    $self->{_features_and_values_hash} = \%features_and_values_hash;
    my %bias_hash = %{$self->{_bias_hash}};
    my @biases = split /(bias[:]\s*class[:])/, $bias_string;
    @biases = grep {defined($_) && length($_) > 0} @biases;
    foreach my $item (@biases) {
        next if $item =~ /bias/;
        my @splits = split /\s+/, $item;
        @splits = grep {defined($_) && length($_) > 0} @splits;
        my $feature_name;
        foreach my $i (0..@splits-1) {
            if ($i == 0) {
                $bias_hash{$splits[0]} = {};
            } elsif ($splits[$i] =~ /(^.+)[:]$/) {
                $feature_name = $1;
                $bias_hash{$splits[0]}->{$feature_name} = [];
            } else {
                next if !defined $feature_name;
                push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
                    if defined $feature_name;
            }
        }
    }
    $self->{_bias_hash} = \%bias_hash;
    if ($debug) {
        print "\n\nClass names: @class_names\n";
        my $num_of_classes = @class_names;
        print "Class priors: @class_priors\n";
        print "Number of classes: $num_of_classes\n";
        print "\nHere are the features and their possible values:\n";
        while ( my ($k, $v) = each %features_and_values_hash ) {
            print "$k ===> @$v\n";
        }
        print "\nHere is the biasing for each class:\n";
        while ( my ($k, $v) = each %bias_hash ) {
            print "$k:\n";
            while ( my ($k1, $v1) = each %$v ) {
                print " $k1 ===> @$v1\n";
            }
        }
    }
}

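## To make the parsing logic above concrete, a symbolic parameter file is expected to
## look roughly like the sketch below (the class names, features, values, and biases
## are made up for illustration; see the Examples directory for a real file):
##
##     class names: benign malignant
##     class priors: 0.8 0.2
##     feature: smoking values: never light heavy
##     feature: exercising values: never occasionally regularly
##     bias: class: benign smoking: never=0.7 exercising: regularly=0.5
##     bias: class: malignant smoking: heavy=0.6 exercising: never=0.4
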
## This method generates training data according to the specifications placed in a
## parameter file that is read by the previous method.
sub gen_symbolic_training_data {
    my $self = shift;
    my @class_names = @{$self->{_class_names}};
    my @class_priors = @{$self->{_class_priors}};
    my %training_sample_records;
    my %features_and_values_hash = %{$self->{_features_and_values_hash}};
    my %bias_hash = %{$self->{_bias_hash}};
    my $how_many_training_samples = $self->{_number_of_samples_for_training};
    my $how_many_test_samples = $self->{_number_of_samples_for_testing};
    my %class_priors_to_unit_interval_map;
    my $accumulated_interval = 0;
    foreach my $i (0..@class_names-1) {
        $class_priors_to_unit_interval_map{$class_names[$i]}
            = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
        $accumulated_interval += $class_priors[$i];
    }
    if ($self->{_debug}) {
        print "Mapping of class priors to unit interval: \n";
        while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
            print "$k => @$v\n";
        }
        print "\n\n";
    }
    my $ele_index = 0;
    while ($ele_index < $how_many_training_samples) {
        my $sample_name = "sample" . "_$ele_index";
        $training_sample_records{$sample_name} = [];
        # Generate class label for this training sample:
        my $roll_the_dice = rand(1.0);
        my $class_label;
        foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
            my $v = $class_priors_to_unit_interval_map{$class_name};
            if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
                push @{$training_sample_records{$sample_name}},
                    "class=" . $class_name;
                $class_label = $class_name;
                last;
            }
        }
        foreach my $feature (keys %features_and_values_hash) {
            my @values = @{$features_and_values_hash{$feature}};
            my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
            my $no_bias = 1.0 / @values;
            $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
            my %value_priors_to_unit_interval_map;
            my @splits = split /\s*=\s*/, $bias_string;
            my $chosen_for_bias_value = $splits[0];
            my $chosen_bias = $splits[1];
            my $remaining_bias = 1 - $chosen_bias;
            my $remaining_portion_bias = $remaining_bias / (@values -1);
            @splits = grep {defined($_) && length($_) > 0} @splits;
            my $accumulated = 0;
            foreach my $i (0..@values-1) {
                if ($values[$i] eq $chosen_for_bias_value) {
                    $value_priors_to_unit_interval_map{$values[$i]}
                        = [$accumulated, $accumulated + $chosen_bias];
                    $accumulated += $chosen_bias;
                } else {
                    $value_priors_to_unit_interval_map{$values[$i]}
                        = [$accumulated, $accumulated + $remaining_portion_bias];
                    $accumulated += $remaining_portion_bias;
                }
            }
            my $roll_the_dice = rand(1.0);
            my $value_label;
            foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
                my $v = $value_priors_to_unit_interval_map{$value_name};
                if ( ($roll_the_dice >= $v->[0])
                     && ($roll_the_dice <= $v->[1]) ) {
                    push @{$training_sample_records{$sample_name}},
                        $feature . "=" . $value_name;
                    $value_label = $value_name;
                    last;
                }
            }
            if ($self->{_debug}) {
                print "mapping feature value priors for '$feature' " .
                      "to unit interval: \n";
                while ( my ($k, $v) =
                        each %value_priors_to_unit_interval_map ) {
                    print "$k => @$v\n";
                }
                print "\n\n";
            }
        }
        $ele_index++;
    }
    $self->{_training_sample_records} = \%training_sample_records;
    if ($self->{_debug}) {
        print "\n\nPRINTING TRAINING RECORDS:\n\n";
        foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
            print "$kee => @{$training_sample_records{$kee}}\n\n";
        }
    }
    my $output_training_file = $self->{_output_training_datafile};
    print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
    open FILEHANDLE, ">$output_training_file";
    my @features = sort keys %features_and_values_hash;
    my $title_string = ',class';
    foreach my $feature_name (@features) {
        $title_string .= ',' . $feature_name;
    }
    print FILEHANDLE "$title_string\n";
    my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
    my $record_string = '';
    foreach my $sample_name (@sample_names) {
        $record_string .= "$sample_name,";
        my @record = @{$training_sample_records{"sample_$sample_name"}};
        my %item_parts_hash;
        foreach my $item (@record) {
            my @splits = grep $_, split /=/, $item;
            $item_parts_hash{$splits[0]} = $splits[1];
        }
        $record_string .= $item_parts_hash{"class"};
        delete $item_parts_hash{"class"};
        my @kees = sort keys %item_parts_hash;
        foreach my $kee (@kees) {
            $record_string .= ",$item_parts_hash{$kee}";
        }
        print FILEHANDLE "$record_string\n";
        $record_string = '';
    }
    close FILEHANDLE;
}

sub sample_index {
    my $arg = shift;
    $arg =~ /_(.+)$/;
    return $1;
}

################################# Decision Tree Introspection #######################################
################################# Class DTIntrospection #######################################

package DTIntrospection;

## Instances constructed from this class can provide explanations for the
## classification decisions at the nodes of a decision tree.
##
## When used in the interactive mode, the decision-tree introspection made possible
## by this class provides answers to the following three questions: (1) List of the
## training samples that fall in the portion of the feature space that corresponds
## to a node of the decision tree; (2) The probabilities associated with the last
## feature test that led to the node; and (3) The class probabilities predicated on
## just the last feature test on the path to that node.
##
## CAVEAT: It is possible for a node to exist even when there are no training
## samples in the portion of the feature space that corresponds to the node. That
## is because a decision tree is based on the probability densities estimated from
## the training data. When training data is non-uniformly distributed, it is
## possible for the probability associated with a point in the feature space to be
## non-zero even when there are no training samples at or in the vicinity of that
## point.
##
## For a node to exist even where there are no training samples in the portion of
## the feature space that belongs to the node is an indication of the generalization
## ability of decision-tree based classification.
##
## When used in a non-interactive mode, an instance of this class can be used to
## create a tabular display that shows what training samples belong directly to the
## portion of the feature space that corresponds to each node of the decision tree.
## An instance of this class can also construct a tabular display that shows how the
## influence of each training sample propagates in the decision tree. For each
## training sample, this display first shows the list of nodes that came into
## existence through feature test(s) that used the data provided by that sample.
## This list for each training sample is followed by a subtree of the nodes that owe
## their existence indirectly to the training sample. A training sample influences a
## node indirectly if the node is a descendant of another node that is affected
## directly by the training sample.

use strict;
use Carp;

sub new {
    my ($class, $dt) = @_;
    croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
        unless ref($dt) eq "Algorithm::DecisionTree";
    bless {
        _dt                                  => $dt,
        _root_dtnode                         => $dt->{_root_node},
        _samples_at_nodes_hash               => {},
        _branch_features_to_nodes_hash       => {},
        _sample_to_node_mapping_direct_hash  => {},
        _node_serial_num_to_node_hash        => {},
        _awareness_raising_msg_shown         => 0,
        _debug                               => 0,
    }, $class;
}

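## A minimal sketch of how an introspection session might be set up, assuming $dt is
## an Algorithm::DecisionTree instance for which the decision tree has already been
## constructed:
##
##     my $introspector = DTIntrospection->new($dt);
##     $introspector->initialize();
##     $introspector->display_training_samples_at_all_nodes_direct_influence_only();
##     $introspector->display_training_samples_to_nodes_influence_propagation();
##     $introspector->explain_classifications_at_multiple_nodes_interactively();
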
sub initialize {
    my $self = shift;
    croak "You must first construct the decision tree before using the DTIntrospection class"
        unless $self->{_root_dtnode};
    $self->recursive_descent($self->{_root_dtnode});
}

sub recursive_descent {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
    $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
    $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
    my @samples_at_node = ();
    foreach my $item (@$branch_features_and_values_or_thresholds) {
        my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
        unless (@samples_at_node) {
            @samples_at_node = @$samples_for_feature_value_combo;
        } else {
            my @accum;
            foreach my $sample (@samples_at_node) {
                push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
            }
            @samples_at_node = @accum;
        }
        last unless @samples_at_node;
    }
    @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
    print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
    $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
    if (@samples_at_node) {
        foreach my $sample (@samples_at_node) {
            if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
                $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
            } else {
                push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
            }
        }
    }
    my $children = $node->get_children();
    foreach my $child (@$children) {
        $self->recursive_descent($child);
    }
}

sub display_training_samples_at_all_nodes_direct_influence_only {
    my $self = shift;
    croak "You must first construct the decision tree before using the DT Introspection class."
        unless $self->{_root_dtnode};
    $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
}

sub recursive_descent_for_showing_samples_at_a_node {
    my $self = shift;
    my $node = shift;
    my $node_serial_number = $node->get_serial_num();
    my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
    if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
        print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
        print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
    }
    map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
}

sub display_training_samples_to_nodes_influence_propagation {
    my $self = shift;
    foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
        if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
            my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
            print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
            print " nodes affected through probabilistic generalization:\n";
            map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
        }
    }
}

sub recursive_descent_for_sample_to_node_influence {
    my $self = shift;
    my $node_serial_num = shift;
    my $nodes_already_accounted_for = shift;
    my $offset = shift;
    $offset .= " ";
    my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
    my @children = map $_->get_serial_num(), @{$node->get_children()};
    my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
    if (@children_affected) {
        print "$offset $node_serial_num => [@children_affected]\n";
    }
    map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
}

sub get_samples_for_feature_value_combo {
    my $self = shift;
    my $feature_value_combo = shift;
    my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
    my @samples = ();
    if ($op eq '=') {
        @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
    } elsif ($op eq '<') {
        foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
            foreach my $item (@features_and_values) {
                my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
                if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
                    push @samples, $sample;
                    last;
                }
            }
        }
    } elsif ($op eq '>') {
        foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
            foreach my $item (@features_and_values) {
                my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
                if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
                    push @samples, $sample;
                    last;
                }
            }
        }
    } else {
        die "Something strange is going on";
    }
    return \@samples;
}

sub extract_feature_op_val {
    my $self = shift;
    my $feature_value_combo = shift;
    my $pattern1 = '(.+)=(.+)';
    my $pattern2 = '(.+)<(.+)';
    my $pattern3 = '(.+)>(.+)';
    my ($feature,$value,$op);
    if ($feature_value_combo =~ /$pattern2/) {
        ($feature,$op,$value) = ($1,'<',$2);
    } elsif ($feature_value_combo =~ /$pattern3/) {
        ($feature,$op,$value) = ($1,'>',$2);
    } elsif ($feature_value_combo =~ /$pattern1/) {
        ($feature,$op,$value) = ($1,'=',$2);
    }
    return ($feature,$op,$value);
}

sub explain_classifications_at_multiple_nodes_interactively {
    my $self = shift;
    croak "You called explain_classification_at_multiple_nodes_interactively() without " .
          "first initializing the DTIntrospection instance in your code. Aborting."
          unless $self->{_samples_at_nodes_hash};
    print "\n\nIn order for the decision tree to introspect\n\n";
    print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
          " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
          " with it even when there are NO training data points in that region?\n" .
          "\nEnter 'y' for yes or any other character for no: ";
    my $ans = <STDIN>;
    $ans =~ s/^\s*|\s*$//g;
    die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
    $self->{_awareness_raising_msg_shown} = 1;
    while (1) {
        my $node_id;
        my $ans;
        while (1) {
            print "\nEnter the integer ID of a node: ";
            $ans = <STDIN>;
            $ans =~ s/^\s*|\s*$//g;
            return if $ans =~ /^exit$/;
            last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
            print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
        }
        $node_id = $ans;
        $self->explain_classification_at_one_node($node_id);
    }
}

sub explain_classification_at_one_node {
    my $self = shift;
    my $node_id = shift;
    croak "You called explain_classification_at_one_node() without first initializing " .
          "the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
    unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
        print "Node $node_id is not a node in the tree\n";
        return;
    }
    unless ($self->{_awareness_raising_msg_shown}) {
        print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
        print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
              " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
              " with it even when there are NO training data points in that region?\n" .
              "\nEnter 'y' for yes or any other character for no: ";
        my $ans = <STDIN>;
        $ans =~ s/^\s*|\s*$//g;
        die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
    }
    my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
    my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
#   my @class_names = @{$self->get_class_names()};
    my @class_names = $self->{_dt}->get_class_names();
    my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
    my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
    my $msg = @samples_at_node == 0
              ? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
              : "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
    $msg .= "\n Feature tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
    $msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
    $msg .= "\n Enter 'y' if yes and any other character for 'no': ";
    print $msg;
    my $ans = <STDIN>;
    $ans =~ s/^\s*|\s*$//g;
    if ($ans =~ /^ye?s?$/) {
        my $sequence = [$branch_features_to_node[-1]];
        my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
        print "\n probability of @{$sequence} is: $prob\n";
    }
    $msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
    $msg .= "\n Enter 'y' for yes and any other character for no: ";
    print $msg;
    $ans = <STDIN>;
    $ans =~ s/^\s*|\s*$//g;
    if ($ans =~ /^ye?s?$/) {
        my $sequence = [$branch_features_to_node[-1]];
        foreach my $cls (@class_names) {
            my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
            print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
        }
    } else {
        print "goodbye\n";
    }
    print "\n Finished supplying information on Node $node_id\n\n";
}

1;

=pod

=encoding UTF8

=head1 NAME

Algorithm::DecisionTree - A Perl module for decision-tree based classification of
multidimensional data.


=head1 SYNOPSIS

    # FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:

    # In general, your call for constructing an instance of the DecisionTree class
    # will look like:

    my $training_datafile = "stage3cancer.csv";
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile                         => $training_datafile,
                 csv_class_column_index                    => 2,
                 csv_columns_for_features                  => [3,4,5,6,7,8],
                 entropy_threshold                         => 0.01,
                 max_depth_desired                         => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 csv_cleanup_needed                        => 1,
             );

    # The constructor option `csv_class_column_index' informs the module as to which
    # column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
    # BASED. The constructor option `csv_columns_for_features' specifies which columns
    # are to be used for feature values. The first row of the CSV file must specify
    # the names of the features. See examples of CSV files in the `Examples'
    # subdirectory.

    # The option `symbolic_to_numeric_cardinality_threshold' is also important. For
    # the example shown above, if an ostensibly numeric feature takes on only 10 or
    # fewer different values in your training datafile, it will be treated like a
    # symbolic feature. The option `entropy_threshold' determines the granularity
    # with which the entropies are sampled for the purpose of calculating entropy gain
    # with a particular choice of decision threshold for a numeric feature or a feature
    # value for a symbolic feature.

    # The option 'csv_cleanup_needed' is by default set to 0. If you set it
    # to 1, that would cause all line records in your CSV file to be "sanitized" before
    # they are used for constructing a decision tree. You need this option if your CSV
    # file uses double-quoted field names and field values in the line records and if
    # such double-quoted strings are allowed to include commas for, presumably, better
    # readability.

    # After you have constructed an instance of the DecisionTree class as shown above,
    # you read in the training data file and initialize the probability cache by
    # calling:

    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();

    # Next you construct a decision tree for your training data by calling:

    $root_node = $dt->construct_decision_tree_classifier();

    # where $root_node is an instance of the DTNode class that is also defined in the
    # module file. Now you are ready to classify a new data record. Let's say that
    # your data record looks like:

    my @test_sample  = qw /  g2=4.2
                             grade=2.3
                             gleason=4
                             eet=1.7
                             age=55.0
                             ploidy=diploid /;

    # You can classify it by calling:

    my $classification = $dt->classify($root_node, \@test_sample);

    # The call to `classify()' returns a reference to a hash whose keys are the class
    # names and the values the associated classification probabilities. This hash also
    # includes another key-value pair for the solution path from the root node to the
    # leaf node at which the final classification was carried out.

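    # As a quick illustration (this snippet is not from the distributed example
    # scripts), you could dump everything in the returned hash as shown below; note
    # that one of its keys holds the solution path rather than a class probability:

    foreach my $key (sort keys %$classification) {
        print "$key  =>  $classification->{$key}\n";
    }
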
=head1 CHANGES

B This version fixes a bug in the C function.
The source of the bug was a typo in a regex component meant for matching with white
space. I have also made one additional change to this function to increase its
versatility. With this change, you are now allowed to have empty strings as values
for features.

B This version reintroduces C as an optional
parameter in the module constructor. This was done in response to several requests
received from the user community. (Previously, all line records from a CSV file were
processed by the C function no matter what.) The main point made by
the users was that invoking C when there was no need for CSV clean-up
extracted a performance penalty when ingesting large database files with tens of
thousands of line records. In addition to making C optional, I
have also tweaked up the code in the C function in order to extract
data from a larger range of messy CSV files.

B All the changes made in this version relate to the construction of
regression trees. I have fixed a couple of bugs in the calculation of the regression
coefficients. Additionally, the C class now comes with a new
constructor parameter named C. For most cases, you'd set this
parameter to 0, which causes the regression coefficients to be estimated through
linear least-squares minimization.

B In addition to constructing decision trees, this version of the
module also allows you to construct regression trees. The regression tree capability
has been packed into a separate subclass, named C, of the main
C class. The subdirectory C in the main
installation directory illustrates how you can use this new functionality of the
module.

B This version incorporates four very significant upgrades/changes to
the C module: B<(1)> The CSV cleanup is now the default. So you do not
have to set any special parameters in the constructor calls to initiate CSV
cleanup. B<(2)> In the form of a new Perl class named C,
this module provides you with an easy-to-use programming interface for attempting
needle-in-a-haystack solutions for the case when your training data is overwhelmingly
dominated by a single class. You need to set the constructor parameter
C to invoke the logic that constructs multiple
decision trees, each using the minority class samples along with samples drawn
randomly from the majority class. The final classification is made through a
majority vote from all the decision trees. B<(3)> Assuming you are faced with a
big-data problem --- in the sense that you have been given a training database with a
very large number of training records --- the class C will
also let you construct multiple decision trees by pulling training data randomly from
your training database (without paying attention to the relative populations of the
classes). The final classification decision for a test sample is based on a majority
vote from all the decision trees thus constructed. See the C
directory for how to use these new features of the module. And, finally, B<(4)>
Support for the old-style '.dat' training files has been dropped in this version.

B This version makes it easier to use a CSV training file that
violates the assumption that a comma be used only to separate the different field
values in a line record. Some large econometrics databases use double-quoted values
for fields, and these values may contain commas (presumably for better readability).
This version also allows you to specify the leftmost entry in the first CSV record
that names all the fields. Previously, this entry was required to be an empty
double-quoted string. I have also made some minor changes to the
'C' method to make it more user friendly for large
training files that may contain tens of thousands of records. When pulling training
data from such files, this method prints out a dot on the terminal screen for every
10000 records it has processed.

B This version brings the boosting capability to the C
module.

B This version adds bagging to the C module. If your
training dataset is large enough, you can ask the module to construct multiple
decision trees using data bags extracted from your dataset. The module can show you
the results returned by the individual decision trees and also the results obtained
by taking a majority vote of the classification decisions made by the individual
trees. You can specify any arbitrary extent of overlap between the data bags.

B The introspection capability in this version packs more of a punch.
For each training data sample, you can now figure out not only the decision-tree
nodes that are affected directly by that sample, but also those nodes that are
affected indirectly through the generalization achieved by the probabilistic modeling
of the data. The 'examples' directory of this version includes additional scripts
that illustrate these enhancements to the introspection capability. See the section
"The Introspection API" for a declaration of the introspection related methods, old
and new.

B In response to requests from several users, this version includes a new
capability: You can now ask the module to introspect about the classification
decisions returned by the decision tree. Toward that end, the module includes a new
class named C. Perhaps the most important bit of information you
are likely to seek through DT introspection is the list of the training samples that
fall directly in the portion of the feature space that is assigned to a node.
B When training samples are non-uniformly distributed in the underlying
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
part of the generalization achieved by probabilistic modeling of the training data.)>
For additional information related to DT introspection, see the section titled
"DECISION TREE INTROSPECTION" in this documentation page.

B fixes a bug in the part of the module that some folks use for generating
synthetic data for experimenting with decision tree construction and classification.
In the class C that is a part of the module, there
was a problem with the order in which the features were recorded from the
user-supplied parameter file. The basic code for decision tree construction and
classification remains unchanged.

B further downshifts the required version of Perl for this module. This
was a result of testing the module with Version 5.10.1 of Perl. Only one statement
in the module code needed to be changed for the module to work with the older version
of Perl.

B fixes the C restriction on the required Perl version. This
version should work with Perl versions 5.14.0 and higher.

B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
else remains the same.

B should prove more robust when the probability distribution for the
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
observations can occur with significant probabilities. A new option in the
DecisionTree constructor lets the user specify the precision with which the
probability distributions are estimated for such features.

B fixes a bug that was caused by the explicitly set zero values for
numerical features being misconstrued as "false" in the conditional statements in
some of the method definitions.

B makes it easier to write code for classifying in one go all of your test
data samples in a CSV file. The bulk classifications obtained can be written out to
either a CSV file or to a regular text file. See the script
C in the C directory for how to
classify all of your test data records in a CSV file. This version also includes
improved code for generating synthetic numeric/symbolic training and test data
records for experimenting with the decision tree classifier.

B allows you to test the quality of your training data by running a 10-fold
cross-validation test on the data. This test divides all of the training data into
ten parts, with nine parts used for training a decision tree and one part used for
testing its ability to classify correctly. This selection of nine parts for training
and one part for testing is carried out in all of the ten different ways that are
possible. This testing functionality in Version 2.1 can also be used to find the
best values to use for the constructor parameters C,
C, and C.

B Now you can use both numeric and
symbolic features for constructing a decision tree. A feature is numeric if it can
take any floating-point value over an interval.

B fixes a bug in the code that was triggered by 0 being declared as one of
the features values in the training datafile. Version 1.71 also includes an additional
safety feature that is useful for training datafiles that contain a very large number
of features. The new version makes sure that the number of values you declare for
each sample record matches the number of features declared at the beginning of the
training datafile.

B includes safety checks on the consistency of the data you place in your
training datafile. When a training file contains thousands of samples, it is
difficult to manually check that you used the same class names in your sample records
that you declared at top of your training file or that the values you have for your
features are legal vis-a-vis the earlier declarations of the values in the training
file. Another safety feature incorporated in this version is the non-consideration
of classes that are declared at the top of the training file but that have no sample
records in the file.

B uses probability caching much more extensively compared to the previous
versions. This should result in faster construction of large decision trees.
Another new feature in Version 1.6 is the use of a decision tree for interactive
classification. In this mode, after you have constructed a decision tree from the
training data, the user is prompted for answers to the questions pertaining to the
feature tests at the nodes of the tree.

Some key elements of the documentation were cleaned up and made more readable in
B. The implementation code remains unchanged from Version 1.4.

B should make things faster (and easier) for folks who want to use this
module with training data that creates very large decision trees (that is, trees with
tens of thousands or more decision nodes). The speedup in Version 1.4 has been
achieved by eliminating duplicate calculation of probabilities as the tree grows. In
addition, this version provides an additional constructor parameter,
C for controlling the size of the decision tree. This is in
addition to the tree size control achieved by the parameter C that
was introduced in Version 1.3. Since large decision trees can take a long time to
create, you may find yourself wishing you could store the tree you just created in a
disk file and that, subsequently, you could use the stored tree for classification
work. The C directory contains two scripts, C and
C, that show how you can do exactly that with the
help of Perl's C module.

B addresses the issue that arises when the header of a training datafile
declares a certain possible value for a feature but that (feature,value) pair does
NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
user to control the size of the decision tree by changing the value of the parameter
C Additionally, Version 1.3 includes a method called
C that displays useful information regarding the size and
some other attributes of the training data. It also warns the user if the training
data might result in a decision tree that would simply be much too large --- unless
the user controls the size with the entropy_threshold parameter.

In addition to the removal of a couple of serious bugs, B incorporates a
number of enhancements: (1) Version 1.2 includes checks on the names of the features
and values used in test data --- this is the data you want to classify with the
decision tree classifier constructed by this module. (2) Version 1.2 includes a
separate constructor for generating test data. To make it easier to generate test
data whose probabilistic parameters may not be identical to that used for the
training data, I have used separate routines for generating the test data. (3)
Version 1.2 also includes in its examples directory a script that classifies the test
data in a file and outputs the class labels into another file. This is for folks who
do not wish to write their own scripts using this module. (4) Version 1.2 also
includes addition to the documentation regarding the issue of numeric values for
features.

=head1 DESCRIPTION

B is a I module for constructing a decision tree from
a training datafile containing multidimensional data. In one form or another,
decision trees have been around for about fifty years. From a statistical
perspective, they are closely related to classification and regression by recursive
partitioning of multidimensional data. Early work that demonstrated the usefulness
of such partitioning of data for classification and regression can be traced to the
work of Terry Therneau in the early 1980's in the statistics community, and to the
work of Ross Quinlan in the mid 1990's in the machine learning community.

For those not familiar with decision tree ideas, the traditional way to classify
multidimensional data is to start with a feature space whose dimensionality is the
same as that of the data. Each feature in this space corresponds to the attribute
that each dimension of the data measures. You then use the training data to carve up
the feature space into different regions, each corresponding to a different class.
Subsequently, when you try to classify a new data sample, you locate it in the
feature space and find the class label of the region to which it belongs. One can
also give the new data point the same class label as that of the nearest training
sample. This is referred to as the nearest neighbor classification. There exist
hundreds of variations of varying power on these two basic approaches to the
classification of multidimensional data.

A decision tree classifier works differently. When you construct a decision tree,
you select for the root node a feature test that partitions the training data in a
way that causes maximal disambiguation of the class labels associated with the data.
In terms of information content as measured by entropy, such a feature test would
cause maximum reduction in class entropy in going from all of the training data taken
together to the data as partitioned by the feature test. You then drop from the root
node a set of child nodes, one for each partition of the training data created by the
feature test at the root node. When your features are purely symbolic, you'll have
one child node for each value of the feature chosen for the feature test at the root.
When the test at the root involves a numeric feature, you find the decision threshold
for the feature that best bipartitions the data and you drop from the root node two
child nodes, one for each partition. Now at each child node you pose the same
question that you posed when you found the best feature to use at the root: Which
feature at the child node in question would maximally disambiguate the class labels
associated with the training data corresponding to that child node?

As the reader would expect, the two key steps in any approach to decision-tree based
classification are the construction of the decision tree itself from a file
containing the training data, and then using the decision tree thus obtained for
classifying new data.

What is cool about decision tree classification is that it gives you soft
classification, meaning it may associate more than one class label with a given data
vector. When this happens, it may mean that your classes are indeed overlapping in
the underlying feature space. It could also mean that you simply have not supplied
sufficient training data to the decision tree classifier. For a tutorial
introduction to how a decision tree is constructed and used, visit
L

This module also allows you to generate your own synthetic training and test
data. Generating your own training data, using it for constructing a decision-tree
classifier, and subsequently testing the classifier on a synthetically generated
test set of data is a good way to develop greater proficiency with decision trees.


=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE

If you are new to the concept of a decision tree, their practical utility is best
understood with an example that only involves symbolic features. However, as
3492
mentioned earlier, versions of the module higher than 2.0 allow you to use both
3493
symbolic and numeric features.
3494
3495
Consider the following scenario: Let's say you are running a small investment company
3496
that employs a team of stockbrokers who make buy/sell decisions for the customers of
3497
your company. Assume that your company has asked the traders to make each investment
3498
decision on the basis of the following four criteria:
3499
3500
price_to_earnings_ratio (P_to_E)
3501
3502
price_to_sales_ratio (P_to_S)
3503
3504
return_on_equity (R_on_E)
3505
3506
market_share (MS)
3507
3508
Since you are the boss, you keep track of the buy/sell decisions made by the
3509
individual traders. But one unfortunate day, all of your traders decide to quit
3510
because you did not pay them enough. So what do you do? If you had a module like
3511
the one here, you could still run your company and do so in such a way that, on the
3512
average, would do better than any of the individual traders who worked for your
3513
company. This is what you do: You pool together the individual trader buy/sell
3514
decisions you have accumulated during the last one year. This pooled information is
3515
likely to look like:
3516
3517
3518
example buy/sell P_to_E P_to_S R_on_E MS
3519
==============================================================
3520
3521
example_1 buy high low medium low
3522
example_2 buy medium medium low low
3523
example_3 sell low medium low high
3524
....
3525
....
3526
3527
This data, when formatted according to CSV, would constitute your training file. You
3528
could feed this file into the module by calling:
3529
3530
my $dt = Algorithm::DecisionTree->new(
3531
training_datafile => $training_datafile,
3532
csv_class_column_index => 1,
3533
csv_columns_for_features => [2,3,4,5],
3534
);
3535
$dt->get_training_data();
3536
$dt->calculate_first_order_probabilities();
3537
$dt->calculate_class_priors();
3538
3539
Subsequently, you would construct a decision tree by calling:
3540
3541
my $root_node = $dt->construct_decision_tree_classifier();
3542
3543
Now you and your company (with practically no employees) are ready to service the
3544
customers again. Suppose your computer needs to make a buy/sell decision about an
3545
investment prospect that is best described by:
3546
3547
price_to_earnings_ratio = low
3548
price_to_sales_ratio = very_low
3549
return_on_equity = none
3550
market_share = medium
3551
3552
All that your computer would need to do would be to construct a data vector like
3553
3554
my @data = qw / P_to_E=low
3555
P_to_S=very_low
3556
R_on_E=none
3557
MS=medium /;
3558
3559
and call the decision tree classifier you just constructed by
3560
3561
$dt->classify($root_node, \@data);
3562
3563
The answer returned will be 'buy' and 'sell', along with the associated
3564
probabilities. So if the probability of 'buy' is considerably greater than the
3565
probability of 'sell', that's what you should instruct your computer to do.
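
If you want your script to act on this soft classification automatically, a minimal
sketch along the following lines would do.  It assumes, as in the scenario above, that
the two class labels in your training file are literally C<buy> and C<sell>, and, as
described later in the METHODS section, that the hash reference returned by
C<classify()> maps class names to their probabilities:

    # Minimal sketch (not part of the module): act on the soft classification
    # returned by classify().  Assumes the hash keys are the bare class names
    # 'buy' and 'sell', as in the example training data above.
    my $classification = $dt->classify($root_node, \@data);
    my ($p_buy, $p_sell) = ($classification->{'buy'}, $classification->{'sell'});
    if ($p_buy > $p_sell) {
        print "recommendation: buy   (probability $p_buy)\n";
    } else {
        print "recommendation: sell  (probability $p_sell)\n";
    }
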
3566
3567
The chances are that, on the average, this approach would beat the performance of any
3568
of your individual traders who worked for you previously since the buy/sell decisions
3569
made by the computer would be based on the collective wisdom of all your previous
3570
traders.  B
captured by the silly little example here.  However, it does nicely convey the
sense in which the current module could be used.>
3573
3574
=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3575
3576
A feature is symbolic when its values are compared using string comparison operators.
3577
By the same token, a feature is numeric when its values are compared using numeric
3578
comparison operators. Having said that, features that take only a small number of
3579
numeric values in the training data can be treated symbolically provided you are
3580
careful about handling their values in the test data. At the least, you have to set
3581
the test data value for such a feature to its closest value in the training data.
3582
The module does that automatically for you for those numeric features for which the
number of different numeric values is less than a user-specified threshold.  For those
3584
numeric features that the module is allowed to treat symbolically, this snapping of
3585
the values of the features in the test data to the small set of values in the training
3586
data is carried out automatically by the module. That is, after a user has told the
3587
module which numeric features to treat symbolically, the user need not worry about
3588
how the feature values appear in the test data.
3589
3590
The constructor parameter C<symbolic_to_numeric_cardinality_threshold> lets you tell
the module when to consider an otherwise numeric feature symbolically.  Suppose you
set this parameter to 10; that means that all numeric-looking features that take 10
or fewer different values in the training datafile will be considered to be symbolic
features by the module.  See the tutorial at
3595
L for
3596
further information on the implementation issues related to the symbolic and numeric
3597
features.
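
For instance, if a hypothetical feature named C<number_of_children> took only the
values 0 through 4 in your training file, a threshold of 10 would cause it to be
handled symbolically.  The constructor call below is only a sketch meant to show where
the parameter goes; the filename and the column indices in it are made up:

    # Sketch only: the CSV filename and the column indices are hypothetical.
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile                         => 'my_training.csv',
                 csv_class_column_index                    => 1,
                 csv_columns_for_features                  => [2,3,4,5],
                 # numeric features with 10 or fewer distinct values in the
                 # training data will be treated symbolically:
                 symbolic_to_numeric_cardinality_threshold => 10,
             );
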
3598
3599
=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3600
3601
For the purpose of estimating the probabilities, it is necessary to sample the range
3602
of values taken on by a numerical feature. For features with "nice" statistical
3603
properties, this sampling interval is set to the median of the differences between
3604
the successive feature values in the training data. (Obviously, as you would expect,
3605
you first sort all the values for a feature before computing the successive
3606
differences.) This logic will not work for the sort of a feature described below.
3607
3608
Consider a feature whose values are heavy-tailed, and, at the same time, the values
3609
span a million to one range. What I mean by heavy-tailed is that rare values can
3610
occur with significant probabilities. It could happen that most of the values for
3611
such a feature are clustered at one of the two ends of the range. At the same time,
3612
there may exist a significant number of values near the end of the range that is less
3613
populated. (Typically, features related to human economic activities --- such as
3614
wealth, incomes, etc. --- are of this type.) With the logic described in the
3615
previous paragraph, you could end up with a sampling interval that is much too small,
3616
which could result in millions of sampling points for the feature if you are not
3617
careful.
3618
3619
Beginning with Version 2.22, you have two options in dealing with such features. You
3620
can choose to go with the default behavior of the module, which is to sample the
3621
value range for such a feature over a maximum of 500 points. Or, you can supply an
3622
additional option to the constructor that sets a user-defined value for the number of
3623
points to use.  The name of the option is C<number_of_histogram_bins>.  The following
3624
script
3625
3626
construct_dt_for_heavytailed.pl
3627
3628
in the C<Examples> directory shows an example of how to call the constructor of the
module with the C<number_of_histogram_bins> option.
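
Here is a sketch of what such a constructor call might look like; the filename, the
column indices, and the bin count used below are placeholders, not recommendations:

    # Sketch only: filename, column indices, and bin count are placeholders.
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile        => 'heavytailed_data.csv',
                 csv_class_column_index   => 1,
                 csv_columns_for_features => [2,3],
                 # cap the sampling of a heavy-tailed feature's value range
                 # at 100 points instead of the default of 500:
                 number_of_histogram_bins => 100,
             );
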
3630
3631
3632
=head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3633
3634
Versions 2.1 and higher include a new class named C, derived from
3635
the main class C, that runs a 10-fold cross-validation test on your
3636
training data to test its ability to discriminate between the classes mentioned in
3637
the training file.
3638
3639
The 10-fold cross-validation test divides all of the training data into ten parts,
3640
with nine parts used for training a decision tree and one part used for testing its
3641
ability to classify correctly. This selection of nine parts for training and one part
3642
for testing is carried out in all of the ten different possible ways.
3643
3644
The following code fragment illustrates how you invoke the testing function of the
3645
EvalTrainingData class:
3646
3647
my $training_datafile = "training.csv";
3648
my $eval_data = EvalTrainingData->new(
3649
training_datafile => $training_datafile,
3650
csv_class_column_index => 1,
3651
csv_columns_for_features => [2,3],
3652
entropy_threshold => 0.01,
3653
max_depth_desired => 3,
3654
symbolic_to_numeric_cardinality_threshold => 10,
3655
csv_cleanup_needed => 1,
3656
);
3657
$eval_data->get_training_data();
3658
$eval_data->evaluate_training_data()
3659
3660
The last statement above prints out a Confusion Matrix and the value of Training Data
3661
Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3662
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3663
cross-validation test.
3664
3665
This testing functionality can also be used to find the best values to use for the
3666
constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.
3668
3669
The following two scripts in the C directory illustrate the use of the
3670
C class for testing the quality of your data:
3671
3672
evaluate_training_data1.pl
3673
evaluate_training_data2.pl
3674
3675
3676
=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3677
3678
Assuming your training data is good, the quality of the results you get from a
3679
decision tree would depend on the choices you make for the constructor parameters
3680
C<entropy_threshold>, C<max_depth_desired>, and
C<symbolic_to_numeric_cardinality_threshold>.  You can optimize your choices for
3682
these parameters by running the 10-fold cross-validation test that is made available
3683
in Versions 2.2 and higher through the new class C that is included
3684
in the module file. A description of how to run this test is in the previous section
3685
of this document.
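
Since C<evaluate_training_data()> prints the Confusion Matrix and the Training Data
Quality Index to your terminal, one simple way to compare parameter choices is to run
the evaluation in a loop and compare the printed indices.  The sketch below reuses the
call sequence shown in the previous section; the candidate values for
C<max_depth_desired> are arbitrary:

    # Sketch only: the candidate values for max_depth_desired are arbitrary.
    foreach my $depth (2, 3, 4, 5) {
        print "\n===== trying max_depth_desired = $depth =====\n";
        my $eval_data = EvalTrainingData->new(
                            training_datafile        => 'training.csv',
                            csv_class_column_index   => 1,
                            csv_columns_for_features => [2,3],
                            entropy_threshold        => 0.01,
                            max_depth_desired        => $depth,
                            symbolic_to_numeric_cardinality_threshold => 10,
                            csv_cleanup_needed       => 1,
                        );
        $eval_data->get_training_data();
        $eval_data->evaluate_training_data();
    }
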
3686
3687
3688
=head1 DECISION TREE INTROSPECTION
3689
3690
Starting with Version 2.30, you can ask the C class of the module to
3691
explain the classification decisions made at the different nodes of the decision
3692
tree.
3693
3694
Perhaps the most important bit of information you are likely to seek through DT
3695
introspection is the list of the training samples that fall directly in the portion
3696
of the feature space that is assigned to a node.
3697
3698
However, note that, when training samples are non-uniformly distributed in the
3699
underlying feature space, it is possible for a node to exist even when there are no
3700
training samples in the portion of the feature space assigned to the node. That is
3701
because the decision tree is constructed from the probability densities estimated
3702
from the training data. When the training samples are non-uniformly distributed, it
3703
is entirely possible for the estimated probability densities to be non-zero in a
3704
small region around a point even when there are no training samples specifically in
3705
that region. (After you have created a statistical model for, say, the height
3706
distribution of people in a community, the model may return a non-zero probability
3707
for the height values in a small interval even if the community does not include a
3708
single individual whose height falls in that interval.)
3709
3710
That a decision-tree node can exist even when there are no training samples in that
3711
portion of the feature space that belongs to the node is an important indication of
3712
the generalization ability of a decision-tree-based classifier.
3713
3714
In light of the explanation provided above, before the DTIntrospection class supplies
3715
any answers at all, it asks you to accept the fact that features can take on non-zero
3716
probabilities at a point in the feature space even though there are zero training
3717
samples at that point (or in a small region around that point). If you do not accept
3718
this rudimentary fact, the introspection class will not yield any answers (since you
3719
are not going to believe the answers anyway).
3720
3721
The point made above implies that the path leading to a node in the decision tree may
3722
test a feature for a certain value or threshold despite the fact that the portion of
3723
the feature space assigned to that node is devoid of any training data.
3724
3725
See the following three scripts in the Examples directory for how to carry out DT
3726
introspection:
3727
3728
introspection_in_a_loop_interactive.pl
3729
3730
introspection_show_training_samples_at_all_nodes_direct_influence.pl
3731
3732
introspection_show_training_samples_to_nodes_influence_propagation.pl
3733
3734
The first script places you in an interactive session in which you will first be
3735
asked for the node number you are interested in. Subsequently, you will be asked for
3736
whether or not you are interested in specific questions that the introspection can
3737
provide answers for. The second script descends down the decision tree and shows for
3738
each node the training samples that fall directly in the portion of the feature space
3739
assigned to that node. The third script shows for each training sample how it
3740
affects the decision-tree nodes either directly or indirectly through the
3741
generalization achieved by the probabilistic modeling of the data.
3742
3743
The output of the script
3744
C looks like:
3745
3746
Node 0: the samples are: None
3747
Node 1: the samples are: [sample_46 sample_58]
3748
Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3749
Node 3: the samples are: []
3750
Node 4: the samples are: []
3751
...
3752
...
3753
3754
The nodes for which no samples are listed come into existence through
3755
the generalization achieved by the probabilistic modeling of the data.
3756
3757
The output produced by the script
3758
C looks like
3759
3760
sample_1:
3761
nodes affected directly: [2 5 19 23]
3762
nodes affected through probabilistic generalization:
3763
2=> [3 4 25]
3764
25=> [26]
3765
5=> [6]
3766
6=> [7 13]
3767
7=> [8 11]
3768
8=> [9 10]
3769
11=> [12]
3770
13=> [14 18]
3771
14=> [15 16]
3772
16=> [17]
3773
19=> [20]
3774
20=> [21 22]
3775
23=> [24]
3776
3777
sample_4:
3778
nodes affected directly: [2 5 6 7 11]
3779
nodes affected through probabilistic generalization:
3780
2=> [3 4 25]
3781
25=> [26]
3782
5=> [19]
3783
19=> [20 23]
3784
20=> [21 22]
3785
23=> [24]
3786
6=> [13]
3787
13=> [14 18]
3788
14=> [15 16]
3789
16=> [17]
3790
7=> [8]
3791
8=> [9 10]
3792
11=> [12]
3793
3794
...
3795
...
3796
...
3797
3798
For each training sample, the display shown above first presents the list of nodes
3799
that are directly affected by the sample. A node is affected directly by a sample if
3800
the latter falls in the portion of the feature space that belongs to the former.
3801
Subsequently, for each training sample, the display shows a subtree of the nodes that
3802
are affected indirectly by the sample through the generalization achieved by the
3803
probabilistic modeling of the data. In general, a node is affected indirectly by a
3804
sample if it is a descendant of another node that is affected directly.
3805
3806
Also see the section titled B regarding how to invoke the
3807
introspection capabilities of the module in your own code.
3808
3809
=head1 METHODS
3810
3811
The module provides the following methods for constructing a decision tree from
3812
training data in a disk file and for classifying new data records with the decision
3813
tree thus constructed:
3814
3815
=over 4
3816
3817
=item B
3818
3819
my $dt = Algorithm::DecisionTree->new(
3820
training_datafile => $training_datafile,
3821
csv_class_column_index => 2,
3822
csv_columns_for_features => [3,4,5,6,7,8],
3823
entropy_threshold => 0.01,
3824
max_depth_desired => 8,
3825
symbolic_to_numeric_cardinality_threshold => 10,
3826
csv_cleanup_needed => 1,
3827
);
3828
3829
A call to C constructs a new instance of the C class.
3830
For this call to make sense, the training data in the training datafile must be
3831
in the CSV format.
3832
3833
=back
3834
3835
=head2 The Constructor Parameters
3836
3837
=over 8
3838
3839
=item C:
3840
3841
This parameter supplies the name of the file that contains the training data.
3842
3843
=item C:
3844
3845
When using a CSV file for your training data, this parameter supplies the zero-based
3846
column index for the column that contains the class label for each data record in the
3847
training file.
3848
3849
3850
=item C:
3851
3852
You need to set this parameter to 1 if your CSV file uses double-quoted strings as
field values and such values are allowed to contain commas (for, presumably, better
readability).
3855
3856
=item C:
3857
3858
When using a CSV file for your training data, this parameter supplies a list of
3859
columns corresponding to the features you wish to use for decision tree construction.
3860
Each column is specified by its zero-based index.
3861
3862
=item C:
3863
3864
This parameter sets the granularity with which the entropies are sampled by the
3865
module. For example, a feature test at a node in the decision tree is acceptable if
3866
the entropy gain achieved by the test exceeds this threshold. The larger the value
3867
you choose for this parameter, the smaller the tree.  Its default value is 0.01.
3868
3869
=item C:
3870
3871
This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3872
smaller the value you choose for this parameter, the smaller the tree.
3873
3874
=item C:
3875
3876
This parameter allows the module to treat an otherwise numeric feature symbolically
3877
if the number of different values the feature takes in the training data file does
3878
not exceed the value of this parameter.
3879
3880
=item C:
3881
3882
This parameter gives the user the option to set the number of points at which the
3883
value range for a feature should be sampled for estimating the probabilities. This
3884
parameter is effective only for those features that occupy a large value range and
3885
whose probability distributions are heavy tailed. B
3886
when you have a very large training dataset:> In general, the larger the dataset, the
3887
smaller the smallest difference between any two values for a numeric feature in
3888
relation to the overall range of values for that feature. In such cases, the module
3889
may use too large a number of bins for estimating the probabilities and that may slow
3890
down the calculation of the decision tree. You can get around this difficulty by
3891
explicitly giving a value to the 'C' parameter.
3892
3893
=back
3894
3895
3896
You can choose the best values to use for the last three constructor parameters by
3897
running a 10-fold cross-validation test on your training data through the class
3898
C that comes with Versions 2.1 and higher of this module. See the
3899
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
3900
3901
=over
3902
3903
=item B
3904
3905
After you have constructed a new instance of the C class,
3906
you must now read in the training data that is in the file named in the call to the
3907
constructor. This you do by:
3908
3909
$dt->get_training_data();
3910
3911
3912
=item B
3913
3914
If you wish to see the training data that was just digested by the module,
3915
call
3916
3917
$dt->show_training_data();
3918
3919
=item B
3920
3921
=item B
3922
3923
After the module has read the training data file, it needs to initialize the
3924
probability cache. This you do by invoking:
3925
3926
$dt->calculate_first_order_probabilities()
3927
$dt->calculate_class_priors()
3928
3929
=item B
3930
3931
With the probability cache initialized, it is time to construct a decision tree
3932
classifier. This you do by
3933
3934
my $root_node = $dt->construct_decision_tree_classifier();
3935
3936
This call returns an instance of type C. The C class is defined
3937
within the main package file. So, don't forget, that C<$root_node> in the above
3938
example call will be instantiated to an object of type C.
3939
3940
=item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3941
3942
$root_node->display_decision_tree(" ");
3943
3944
This will display the decision tree in your terminal window by using a recursively
3945
determined offset for each node as the display routine descends down the tree.
3946
3947
I have intentionally left the syntax fragment C<$root_node> in the above call to
3948
remind the reader that C is NOT called on the instance of
3949
the C we constructed earlier, but on the C instance returned by
3950
the call to C.
3951
3952
=item B
3953
3954
Let's say you want to classify the following data record:
3955
3956
my @test_sample = qw / g2=4.2
3957
grade=2.3
3958
gleason=4
3959
eet=1.7
3960
age=55.0
3961
ploidy=diploid /;
3962
3963
you'd make the following call:
3964
3965
my $classification = $dt->classify($root_node, \@test_sample);
3966
3967
where, again, C<$root_node> is an instance of type C returned by the call to
3968
C. The variable C<$classification> holds a
3969
reference to a hash whose keys are the class names and whose values the associated
3970
probabilities. The hash that is returned by the above call also includes a special
3971
key-value pair for a key named C. The value associated with this key
3972
is an anonymous array that holds the path, in the form of a list of nodes, from the
3973
root node to the leaf node in the decision tree where the final classification was
3974
made.
3975
3976
3977
=item B
3978
3979
This method allows you to use a decision-tree based classifier in an interactive
3980
mode. In this mode, a user is prompted for answers to the questions pertaining to
3981
the feature tests at the nodes of the tree. The syntax for invoking this method is:
3982
3983
my $classification = $dt->classify_by_asking_questions($root_node);
3984
3985
where C<$dt> is an instance of the C class returned by a
3986
call to C and C<$root_node> the root node of the decision tree returned by a
3987
call to C.
3988
3989
=back
3990
3991
3992
=head1 THE INTROSPECTION API
3993
3994
To construct an instance of C<DTIntrospection>, you call
3995
3996
my $introspector = DTIntrospection->new($dt);
3997
3998
where you supply the instance of the C class you used for constructing
3999
the decision tree through the parameter C<$dt>. After you have constructed an
4000
instance of the introspection class, you must initialize it by
4001
4002
$introspector->initialize();
4003
4004
Subsequently, you can invoke either of the following methods:
4005
4006
$introspector->explain_classification_at_one_node($node);
4007
4008
$introspector->explain_classifications_at_multiple_nodes_interactively();
4009
4010
depending on whether you want introspection at a single specified node or inside an
4011
infinite loop for an arbitrary number of nodes.
4012
4013
If you want to output a tabular display that shows for each node in the decision tree
4014
all the training samples that fall in the portion of the feature space that belongs
4015
to that node, call
4016
4017
$introspector->display_training_samples_at_all_nodes_direct_influence_only();
4018
4019
If you want to output a tabular display that shows for each training sample a list of
4020
all the nodes that are affected directly AND indirectly by that sample, call
4021
4022
$introspector->display_training_samples_to_nodes_influence_propagation();
4023
4024
A training sample affects a node directly if the sample falls in the portion of the
feature space assigned to that node.  On the other hand, a training sample is
considered to affect a node indirectly if the node is a descendant of a node that is
affected directly by the sample.
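
Putting the calls listed above together, an introspection session might be sketched as
follows.  The node number C<5> used here is an arbitrary example, and it is assumed
(as suggested by the interactive script) that C<explain_classification_at_one_node()>
takes an integer node number:

    # Sketch only: node number 5 is an arbitrary example.
    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();

    # explanation for a single node:
    $introspector->explain_classification_at_one_node(5);

    # tabular display of the training samples that land directly at each node:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();
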
4028
4029
4030
=head1 BULK CLASSIFICATION OF DATA RECORDS
4031
4032
For large test datasets, you would obviously want to process an entire file of test
4033
data at a time. The following scripts in the C directory illustrate how you
4034
can do that:
4035
4036
classify_test_data_in_a_file.pl
4037
4038
This script requires three command-line arguments: the first names the
4039
training datafile, the second the test datafile, and the third the file in which the
4040
classification results are to be deposited.
4041
4042
The other examples directories, C, C, and
4043
C, also contain scripts that illustrate how to carry out
4044
bulk classification of data records when you wish to take advantage of bagging,
4045
boosting, or tree randomization. In their respective directories, these scripts are
4046
named:
4047
4048
bagging_for_bulk_classification.pl
4049
boosting_for_bulk_classification.pl
4050
classify_database_records.pl
4051
4052
4053
=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4054
4055
It depends on whether you apply the classifier at once to all the data samples in a
4056
file, or whether you feed one data sample at a time into the classifier.
4057
4058
In general, the classifier returns soft classification for a test data vector. What
4059
that means is that, in general, the classifier will list all the classes to which a
4060
given data vector could belong and the probability of each such class label for the
4061
data vector. Run the examples scripts in the Examples directory to see how the output
4062
of classification can be displayed.
4063
4064
With regard to the soft classifications returned by this classifier, if the
4065
probability distributions for the different classes overlap in the underlying feature
4066
space, you would want the classifier to return all of the applicable class labels for
4067
a data vector along with the corresponding class probabilities.  Another reason
why the decision tree classifier may associate significant probabilities with
multiple class labels is that you used an inadequate number of training samples to
induce the decision tree.  The good thing is that the classifier does not lie to you
(unlike, say, a hard classification rule that would return a single class label
corresponding to the partitioning of the underlying feature space).  The decision
tree classifier gives you the best classification that can be made given the training
data you fed into it.
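
When you feed one data sample at a time into the classifier, displaying the soft
classification in your own script can be as simple as sorting the returned class
probabilities.  The sketch below assumes, per the METHODS section, that the returned
hash maps class names to probabilities; the name C<solution_path> used for skipping
the path entry is an assumption about that special key:

    # Sketch only: assumes the hash returned by classify() maps class names
    # to probabilities, with the path stored under the key 'solution_path'.
    my $classification = $dt->classify($root_node, \@test_sample);
    my @classes = grep { $_ ne 'solution_path' } keys %$classification;
    foreach my $class (sort { $classification->{$b} <=> $classification->{$a} } @classes) {
        printf "%-20s  probability: %.4f\n", $class, $classification->{$class};
    }
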
4075
4076
4077
=head1 USING BAGGING
4078
4079
Starting with Version 3.0, you can use the class C that
4080
comes with the module to incorporate bagging in your decision tree based
4081
classification. Bagging means constructing multiple decision trees for different
4082
(possibly overlapping) segments of the data extracted from your training dataset and
4083
then aggregating the decisions made by the individual decision trees for the final
4084
classification. The aggregation of the classification decisions can average out the
4085
noise and bias that may otherwise affect the classification decision obtained from
4086
just one tree.
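
The class provides its own method for aggregating the per-bag decisions (see the
method descriptions below).  Purely to illustrate the idea of majority voting, here is
a stand-alone sketch, independent of the module's API, that takes one class label per
bag and returns the label that occurs most often:

    # Stand-alone illustration of majority voting over per-bag decisions;
    # this is NOT code from the module itself.
    sub majority_vote {
        my @decisions = @_;                     # e.g. ('buy', 'sell', 'buy', 'buy')
        my %votes;
        $votes{$_}++ for @decisions;
        my ($winner) = sort { $votes{$b} <=> $votes{$a} } keys %votes;
        return $winner;
    }
    print majority_vote('buy', 'sell', 'buy', 'buy'), "\n";   # prints "buy"
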
4087
4088
=over 4
4089
4090
=item B
4091
4092
A typical call to the constructor for the C class looks
4093
like:
4094
4095
use Algorithm::DecisionTreeWithBagging;
4096
4097
my $training_datafile = "stage3cancer.csv";
4098
4099
my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4100
training_datafile => $training_datafile,
4101
csv_class_column_index => 2,
4102
csv_columns_for_features => [3,4,5,6,7,8],
4103
entropy_threshold => 0.01,
4104
max_depth_desired => 8,
4105
symbolic_to_numeric_cardinality_threshold => 10,
4106
how_many_bags => 4,
4107
bag_overlap_fraction => 0.2,
4108
csv_cleanup_needed => 1,
4109
);
4110
4111
Note in particular the following two constructor parameters:
4112
4113
how_many_bags
4114
4115
bag_overlap_fraction
4116
4117
where, as the name implies, the parameter C controls how many bags
4118
(and, therefore, how many decision trees) will be constructed from your training
4119
dataset; and where the parameter C controls the degree of
4120
overlap between the bags. To understand what exactly is achieved by setting the
4121
parameter C to 0.2 in the above example, let's say that the
4122
non-overlapping partitioning of the training data between the bags results in 100
4123
training samples per bag.  With C<bag_overlap_fraction> set to 0.2, an additional 20
samples drawn randomly from the other bags will be added to the data in each bag.
4125
4126
=back
4127
4128
=head2 B class>
4129
4130
=over 8
4131
4132
=item B
4133
4134
This method reads your training datafile, randomizes it, and then partitions it into
4135
the specified number of bags. Subsequently, if the constructor parameter
4136
C is non-zero, it adds to each bag additional samples drawn at
4137
random from the other bags. The number of these additional samples added to each bag
4138
is controlled by the constructor parameter C. If this
4139
parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4140
drawn from the other bags.
4141
4142
=item B
4143
4144
Shows for each bag the names of the training data samples in that bag.
4145
4146
=item B
4147
4148
Calls on the appropriate methods of the main C class to estimate the
4149
first-order probabilities from the data samples in each bag.
4150
4151
=item B
4152
4153
Calls on the appropriate method of the main C class to estimate the
4154
class priors for the data samples in each bag.
4155
4156
=item B
4157
4158
Calls on the appropriate method of the main C class to construct a
4159
decision tree from the training data in each bag.
4160
4161
=item B
4162
4163
Displays separately the decision tree constructed for each bag.
4164
4165
=item B
4166
4167
Calls on the appropriate methods of the main C class to classify the
4168
argument test sample.
4169
4170
=item B
4171
4172
Displays separately the classification decision made by the decision tree
constructed for each bag.
4174
4175
=item B
4176
4177
Using majority voting, this method aggregates the classification decisions made by
4178
the individual decision trees into a single decision.
4179
4180
=back
4181
4182
See the example scripts in the directory C for how to call these
4183
methods for classifying individual samples and for bulk classification when you place
4184
all your test samples in a single file.
4185
4186
=head1 USING BOOSTING
4187
4188
Starting with Version 3.20, you can use the class C for
4189
constructing a boosted decision-tree classifier. Boosting results in a cascade of
4190
decision trees in which each decision tree is constructed with samples that are
4191
mostly those that are misclassified by the previous decision tree. To be precise,
4192
you create a probability distribution over the training samples for the selection of
4193
samples for training each decision tree in the cascade. To start out, the
4194
distribution is uniform over all of the samples. Subsequently, this probability
4195
distribution changes according to the misclassifications by each tree in the cascade:
4196
if a sample is misclassified by a given tree in the cascade, the probability of its
4197
being selected for training the next tree is increased significantly. You also
4198
associate a trust factor with each decision tree depending on its power to classify
4199
correctly all of the training data samples. After a cascade of decision trees is
4200
constructed in this manner, you construct a final classifier that calculates the
4201
class label for a test data sample by taking into account the classification
4202
decisions made by each individual tree in the cascade, the decisions being weighted
4203
by the trust factors associated with the individual classifiers. These boosting
4204
notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4205
celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4206
Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4207
the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4208
tutorial introduction to AdaBoost, see L
4209
4210
Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4211
boosting apply only to the case of binary classification. Additionally, your
4212
training dataset must capture all of the significant statistical variations in the
4213
classes represented therein.
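
For readers who want to see the arithmetic behind the trust factors and the changing
sample-selection probabilities, here is a small stand-alone sketch of the standard
AdaBoost update.  It is meant only to illustrate the algorithm referenced above and is
not the module's internal code; the error rate and the weights used are made-up
numbers:

    # Stand-alone illustration of the AdaBoost update; NOT the module's code.
    # $err is the weighted misclassification rate of the current tree.
    my $err   = 0.2;
    my $trust = 0.5 * log((1 - $err) / $err);     # trust factor (alpha)

    # Re-weight the samples: misclassified samples become more likely to be
    # drawn for training the next tree, correctly classified ones less likely.
    my @weights       = (0.25, 0.25, 0.25, 0.25);
    my @misclassified = (0, 1, 0, 0);             # 1 => sample was misclassified
    my @new_weights   = map {
        $weights[$_] * exp($misclassified[$_] ? $trust : -$trust)
    } 0 .. $#weights;
    my $total = 0;
    $total += $_ for @new_weights;
    @new_weights = map { $_ / $total } @new_weights;   # renormalize
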
4214
4215
=over 4
4216
4217
=item B
4218
4219
If you'd like to experiment with boosting, a typical call to the constructor for the
4220
C class looks like:
4221
4222
use Algorithm::BoostedDecisionTree;
4223
my $training_datafile = "training6.csv";
4224
my $boosted = Algorithm::BoostedDecisionTree->new(
4225
training_datafile => $training_datafile,
4226
csv_class_column_index => 1,
4227
csv_columns_for_features => [2,3],
4228
entropy_threshold => 0.01,
4229
max_depth_desired => 8,
4230
symbolic_to_numeric_cardinality_threshold => 10,
4231
how_many_stages => 4,
4232
csv_cleanup_needed => 1,
4233
);
4234
4235
Note in particular the constructor parameter:
4236
4237
how_many_stages
4238
4239
As its name implies, this parameter controls how many stages will be used in the
4240
boosted decision tree classifier. As mentioned above, a separate decision tree is
4241
constructed for each stage of boosting using a set of training samples that are drawn
4242
through a probability distribution maintained over the entire training dataset.
4243
4244
=back
4245
4246
=head2 B class>
4247
4248
=over 8
4249
4250
=item B
4251
4252
This method reads your training datafile and creates the data structures from the
ingested data for constructing the base decision tree.
4254
4255
=item B
4256
4257
Writes to the standard output the training data samples and also some relevant
4258
properties of the features used in the training dataset.
4259
4260
=item B
4261
4262
Calls on the appropriate methods of the main C class to estimate the
4263
first-order probabilities and the class priors.
4264
4265
=item B
4266
4267
Calls on the appropriate method of the main C class to construct the
4268
base decision tree.
4269
4270
=item B
4271
4272
Displays the base decision tree in your terminal window. (The textual form of the
4273
decision tree is written out to the standard output.)
4274
4275
=item B
4276
4277
Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4278
earlier, the training samples for each tree in the cascade are drawn using a
4279
probability distribution over the entire training dataset. This probability
4280
distribution for any given tree in the cascade is heavily influenced by which
4281
training samples are misclassified by the previous tree.
4282
4283
=item B
4284
4285
Displays separately in your terminal window the decision tree constructed for each
4286
stage of the cascade. (The textual form of the trees is written out to the standard
4287
output.)
4288
4289
=item B
4290
4291
Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4292
4293
=item B
4294
4295
You can call this method to display in your terminal window the classification
4296
decision made by each decision tree in the cascade. The method also prints out the
4297
trust factor associated with each decision tree. It is important to look
4298
simultaneously at the classification decision and the trust factor for each tree ---
4299
since a classification decision made by a specific tree may appear bizarre for a
4300
given test sample. This method is useful primarily for debugging purposes.
4301
4302
=item B
4303
4304
As with the previous method, this method is useful mostly for debugging. It returns
4305
class labels for the samples misclassified by the stage whose integer index is
4306
supplied as an argument to the method. Say you have 10 stages in your cascade. The
4307
value of the argument C would go from 0 to 9, with 0 corresponding to
4308
the base tree.
4309
4310
=item B
4311
4312
Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4313
classification decisions made by the individual trees while taking into account the
4314
trust factors associated with the trees. As mentioned earlier, we associate with
4315
each tree of the cascade a trust factor that depends on the overall misclassification
4316
rate associated with that tree.
4317
4318
=back
4319
4320
See the example scripts in the C subdirectory for how to call the
4321
methods listed above for classifying individual data samples with boosting and for
4322
bulk classification when you place all your test samples in a single file.
4323
4324
4325
=head1 USING RANDOMIZED DECISION TREES
4326
4327
Consider the following two situations that call for using randomized decision trees,
4328
meaning multiple decision trees that are trained using data extracted randomly from a
4329
large database of training samples:
4330
4331
(1) Consider a two-class problem for which the training database is grossly
4332
imbalanced in how many majority-class samples it contains vis-a-vis the number of
4333
minority class samples. Let's assume for a moment that the ratio of majority class
4334
samples to minority class samples is 1000 to 1. Let's also assume that you have a
4335
test dataset that is drawn randomly from the same population mixture from which the
4336
training database was created. Now consider a stupid data classification program
4337
that classifies everything as belonging to the majority class. If you measure the
4338
classification accuracy rate as the ratio of the number of samples correctly
4339
classified to the total number of test samples selected randomly from the population,
4340
this classifier would work with an accuracy of roughly 99.9%.
4341
4342
(2) Let's now consider another situation in which we are faced with a huge training
4343
database but in which every class is equally well represented. Feeding all the data
4344
into a single decision tree would be akin to polling all of the population of the
4345
United States for measuring the Coke-versus-Pepsi preference in the country. You are
4346
likely to get better results if you construct multiple decision trees, each trained
4347
with a collection of training samples drawn randomly from the training database.
4348
After you have created all the decision trees, your final classification decision
4349
could then be based on, say, majority voting by the trees.
4350
4351
In summary, the C class allows you to solve the following
4352
two problems: (1) Data classification using the needle-in-a-haystack metaphor, that
4353
is, when a vast majority of your training samples belong to just one class. And (2)
4354
You have access to a very large database of training samples and you wish to
4355
construct an ensemble of decision trees for classification.
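
To make the needle-in-a-haystack logic concrete, here is a stand-alone sketch, not the
module's code, of how the training set for one such tree could be assembled.  It keeps
every minority-class record and pairs it with a random draw from the majority class;
the equal-sized draw is an assumption made just for this illustration:

    # Stand-alone illustration of building one needle-in-a-haystack training
    # set; NOT the module's code.  @$minority_ref and @$majority_ref hold
    # the data records for the two classes.
    use List::Util qw(shuffle);

    sub one_balanced_training_set {
        my ($minority_ref, $majority_ref) = @_;
        my @sampled_majority = (shuffle @$majority_ref)[0 .. $#{$minority_ref}];
        return (@$minority_ref, @sampled_majority);
    }
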
4356
4357
=over 4
4358
4359
=item B
4360
4361
Here is how you'd call the C constructor for
4362
needle-in-a-haystack classification:
4363
4364
use Algorithm::RandomizedTreesForBigData;
4365
my $training_datafile = "your_database.csv";
4366
my $rt = Algorithm::RandomizedTreesForBigData->new(
4367
training_datafile => $training_datafile,
4368
csv_class_column_index => 48,
4369
csv_columns_for_features => [24,32,33,34,41],
4370
entropy_threshold => 0.01,
4371
max_depth_desired => 8,
4372
symbolic_to_numeric_cardinality_threshold => 10,
4373
how_many_trees => 5,
4374
looking_for_needles_in_haystack => 1,
4375
csv_cleanup_needed => 1,
4376
);
4377
4378
Note in particular the constructor parameters:
4379
4380
looking_for_needles_in_haystack
4381
how_many_trees
4382
4383
The first of these parameters, C, invokes the logic for
4384
constructing an ensemble of decision trees, each based on a training dataset that
4385
uses all of the minority class samples, and a random drawing from the majority class
4386
samples.
4387
4388
Here is how you'd call the C constructor for a more
4389
general attempt at constructing an ensemble of decision trees, with each tree trained
4390
with randomly drawn samples from a large database of training data (without paying
4391
attention to the differences in the sizes of the populations for the different
4392
classes):
4393
4394
use Algorithm::RandomizedTreesForBigData;
4395
my $training_datafile = "your_database.csv";
4396
my $rt = Algorithm::RandomizedTreesForBigData->new(
4397
training_datafile => $training_datafile,
4398
csv_class_column_index => 2,
4399
csv_columns_for_features => [3,4,5,6,7,8],
4400
entropy_threshold => 0.01,
4401
max_depth_desired => 8,
4402
symbolic_to_numeric_cardinality_threshold => 10,
4403
how_many_trees => 3,
4404
how_many_training_samples_per_tree => 50,
4405
csv_cleanup_needed => 1,
4406
);
4407
4408
Note in particular the constructor parameters:
4409
4410
how_many_training_samples_per_tree
4411
how_many_trees
4412
4413
When you set the C parameter, you are not allowed
4414
to also set the C parameter, and vice versa.
4415
4416
=back
4417
4418
=head2 B class>
4419
4420
=over 8
4421
4422
=item B
4423
4424
What this method does depends on which of the two constructor parameters ---
4425
C or C --- is
4426
set. When the former is set, it creates a collection of training datasets for
4427
C<how_many_trees> number of decision trees, with each dataset being a mixture of the
minority class samples and samples drawn randomly from the majority class.  However, when the
4429
latter option is set, all the datasets are drawn randomly from the training database
4430
with no particular attention given to the relative populations of the two classes.
4431
4432
=item B
4433
4434
As the name implies, this method shows the training data being used for all the
4435
decision trees. This method is useful for debugging purposes using small datasets.
4436
4437
=item B
4438
4439
Calls on the appropriate method of the main C to estimate the
4440
first-order probabilities for the training dataset to be used for each decision tree.
4441
4442
=item B
4443
4444
Calls on the appropriate method of the main C class to estimate the
4445
class priors for the training dataset to be used for each decision tree.
4446
4447
=item B
4448
4449
Calls on the appropriate method of the main C class to construct the
4450
decision trees.
4451
4452
=item B
4453
4454
Displays all the decision trees in your terminal window. (The textual form of the
4455
decision trees is written out to the standard output.)
4456
4457
=item B
4458
4459
The test_sample is sent to each decision tree for classification.
4460
4461
=item B
4462
4463
The classification decisions returned by the individual decision trees are written
4464
out to the standard output.
4465
4466
=item B
4467
4468
This method aggregates the classification results returned by the individual decision
4469
trees and returns the majority decision.
4470
4471
=back
4472
4473
=head1 CONSTRUCTING REGRESSION TREES:
4474
4475
Decision tree based modeling requires that the class labels be distinct. That is,
4476
the training dataset must contain a relatively small number of discrete class labels
4477
for all of your data records if you want to model the data with one or more decision
4478
trees. However, when one is trying to understand all of the associational
4479
relationships that exist in a large database, one often runs into situations where,
4480
instead of discrete class labels, you have a continuously valued variable as a
4481
dependent variable whose values are predicated on a set of feature values. It is for
4482
such situations that you will find useful the new class C that is now
4483
a part of the C module. The C class has been
4484
programmed as a subclass of the main C class.
4485
4486
You can think of regression with a regression tree as a powerful generalization of
4487
the very commonly used Linear Regression algorithms. Although you can certainly
4488
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4489
modeling nonlinearities between the predictor variables and the dependent variable,
4490
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4491
can inject continuities between the predictor and the predicted variables that may
4492
not really exist in the real data. Regression trees, on the other hand, give you a
4493
piecewise linear relationship between the predictor and the predicted variables that
4494
is freed from the constraints of superimposed continuities at the joins between the
4495
different segments. See the following tutorial for further information regarding the
4496
standard linear regression approach and the regression that can be achieved with the
4497
RegressionTree class in this module:
4498
L
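
As a stand-alone illustration of what such a piecewise linear relationship looks like
in code (this is not the module's output; the split point and the two linear models
below are made up):

    # Stand-alone illustration of a piecewise-linear prediction (NOT the
    # module's code): a different linear model applies on each side of a
    # split point found for the predictor variable.
    sub piecewise_linear_predict {
        my ($x) = @_;
        return $x < 10.0 ? 1.5 * $x + 2.0      # model for the left segment
                         : 0.4 * $x + 13.0;    # model for the right segment
    }
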
4499
4500
The RegressionTree class in the current version of the module assumes that all of
4501
your data is numerical. That is, unlike what is possible with the DecisionTree class
4502
(and the other more closely related classes in this module) that allow your training
4503
file to contain a mixture of numerical and symbolic data, the RegressionTree class
4504
requires that ALL of your data be numerical. I hope to relax this constraint in
4505
future versions of this module. Obviously, the dependent variable will always be
4506
numerical for regression.
4507
4508
See the example scripts in the directory C if you wish to become
4509
more familiar with the regression capabilities of the module.
4510
4511
=over 4
4512
4513
=item B
4514
4515
my $training_datafile = "gendata5.csv";
4516
my $rt = Algorithm::RegressionTree->new(
4517
training_datafile => $training_datafile,
4518
dependent_variable_column => 2,
4519
predictor_columns => [1],
4520
mse_threshold => 0.01,
4521
max_depth_desired => 2,
4522
jacobian_choice => 0,
4523
csv_cleanup_needed => 1,
4524
);
4525
4526
Note in particular the constructor parameters:
4527
4528
dependent_variable
4529
predictor_columns
4530
mse_threshold
4531
jacobian_choice
4532
4533
The first of these parameters, C, is set to the column index in
4534
the CSV file for the dependent variable. The second constructor parameter,
4535
C, tells the system as to which columns contain values for the
4536
predictor variables. The third parameter, C, is for deciding when to
4537
partition the data at a node into two child nodes as a regression tree is being
4538
constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4539
partitioning any of the features at a node is smaller than C, that
4540
node becomes a leaf node of the regression tree.
4541
4542
The last parameter, C, must be set to either 0 or 1 or 2. Its
4543
default value is 0. When this parameter equals 0, the regression coefficients are
4544
calculated using the linear least-squares method and no further "refinement" of the
4545
coefficients is carried out using gradient descent. This is the fastest way to
4546
calculate the regression coefficients. When C is set to 1, you get
4547
a weak version of gradient descent in which the Jacobian is set to the "design
4548
matrix" itself. Choosing 2 for C results in a more reasonable
4549
approximation to the Jacobian. That, however, is at a cost of much longer
4550
computation time. B For most cases, using 0 for C is the
4551
best choice. See my tutorial "I" for why
4552
that is the case.
4553
4554
=back
4555
4556
=head2 B class>
4557
4558
=over 8
4559
4560
=item B
4561
4562
Only CSV training datafiles are allowed. Additionally, the first record in the file
4563
must list the names of the fields, and the first column must contain an integer ID
4564
for each record.
4565
4566
=item B
4567
4568
As the name implies, this is the method that constructs a regression tree.
4569
4570
=item B
4571
4572
Displays the regression tree, as the name implies. The white-space string argument
4573
specifies the offset to use in displaying the child nodes in relation to a parent
4574
node.
4575
4576
=item B
4577
4578
You call this method after you have constructed a regression tree if you want to
4579
calculate the prediction for one sample. The parameter C<$root_node> is what is
4580
returned by the call C. The formatting of the argument
4581
bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4582
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4583
this case, the C<$test_sample> parameter will be bound to a list that will look like
4584
4585
['xvar1 = 23.4', 'xvar2 = 12.9']
4586
4587
Arbitrary amount of white space, including none, on the two sides of the equality
4588
symbol is allowed in the construct shown above. A call to this method returns a
4589
dictionary with two key-value pairs. One of the keys is called C and
4590
the other C. The value associated with key C is the path
4591
in the regression tree to the leaf node that yielded the prediction. And the value
4592
associated with the key C is the answer you are looking for.
4593
4594
=item B
4595
4596
This call calculates the predictions for all of the predictor variables data in your
4597
training file. The parameter C<$root_node> is what is returned by the call to
4598
C. The values for the dependent variable thus predicted
4599
can be seen by calling C, which is the method mentioned below.
4600
4601
=item B
4602
4603
This method displays the results obtained by calling the prediction method of the
4604
previous entry. This method also creates a hardcopy of the plots and saves it as a
4605
C<.png> disk file. The name of this output file is always C.
4606
4607
=item B
4608
4609
This method carries out an error analysis of the predictions for the samples in your
4610
training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4611
regression, the MSE for the data samples at each of the leaf nodes of the regression
4612
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4613
The parameter C<$root_node> in the call syntax is what is returned by the call to
4614
C.
4615
4616
=item B
4617
4618
Call this method if you want to apply the regression tree to all your test data in a
4619
disk file. The predictions for all of the test samples in the disk file are written
4620
out to another file whose name is the same as that of the test file except for the
4621
addition of C<_output> in the name of the file. The parameter C<$filename> is the
4622
name of the disk file that contains the test data. And the parameter C<$columns> is a
4623
list of the column indices for the predictor variables in the test file.
4624
4625
=back
4626
4627
=head1 GENERATING SYNTHETIC TRAINING DATA
4628
4629
The module file contains the following additional classes: (1)
4630
C, and (2) C for
4631
generating synthetic training data.
4632
4633
The class C outputs one CSV file for the
4634
training data and another one for the test data for experimenting with numeric
4635
features. The numeric values are generated using a multivariate Gaussian
4636
distribution whose mean and covariance are specified in a parameter file. See the
4637
file C in the C directory for an example of such a
4638
parameter file. Note that the dimensionality of the data is inferred from the
4639
information you place in the parameter file.
4640
4641
The class C generates synthetic training data for the
purely symbolic case.  The relative frequencies of the different possible values for
the features are controlled by the biasing information you place in a parameter file.
4644
See C for an example of such a file.
4645
4646
4647
=head1 THE C DIRECTORY
4648
4649
See the C directory in the distribution for how to construct a decision
4650
tree, and how to then classify new data using the decision tree. To become more
4651
familiar with the module, run the scripts
4652
4653
construct_dt_and_classify_one_sample_case1.pl
4654
construct_dt_and_classify_one_sample_case2.pl
4655
construct_dt_and_classify_one_sample_case3.pl
4656
construct_dt_and_classify_one_sample_case4.pl
4657
4658
The first script is for the purely symbolic case, the second for the case that
4659
involves both numeric and symbolic features, the third for the case of purely numeric
4660
features, and the last for the case when the training data is synthetically generated
4661
by the script C.
4662
4663
Next run the following script as it is for bulk classification of data records placed
4664
in a CSV file:
4665
4666
classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4667
4668
The script first constructs a decision tree using the training data in the training
4669
file supplied by the first argument file C. The script then
4670
calculates the class label for each data record in the test data file supplied
4671
through the second argument file, C. The estimated class labels are
4672
written out to the output file which in the call shown above is C. An
4673
important thing to note here is that your test file --- in this case C ---
4674
must have a column for class labels. Obviously, in real-life situations, there will
4675
be no class labels in this column.  When that is the case, you can place an empty
string C<""> there for each data record.  This is demonstrated by the following call:
4677
4678
classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4679
4680
The following script in the C directory
4681
4682
classify_by_asking_questions.pl
4683
4684
shows how you can use a decision-tree classifier interactively. In this mode, you
4685
first construct the decision tree from the training data and then the user is
4686
prompted for answers to the feature tests at the nodes of the tree.
4687
4688
If your training data has a feature whose values span a large range and, at the same
time, are characterized by a heavy-tailed distribution, you should look at the script

    construct_dt_for_heavytailed.pl

to see how to use the option C<number_of_histogram_bins> in the call to the
constructor.  This option was introduced in Version 2.22 for dealing with such
features.  If you do not set this option, the module will use the default value of
500 for the number of points at which to sample the value range for such a feature.
A sketch of how you might set this option is shown below.

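For example, the constructor call in your own script might include the option as
shown here; the value C<100> and the other arguments are purely illustrative:

    my $dt = Algorithm::DecisionTree->new(
                 training_datafile => 'my_heavytailed_training.csv',   # placeholder
                 csv_class_column_index => 1,
                 csv_columns_for_features => [2,3,4],
                 number_of_histogram_bins => 100,   # sample the value range at 100 points
                                                    # instead of the default of 500
                 csv_cleanup_needed => 1,
             );
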
The C<Examples> directory also contains the following scripts:

    generate_training_data_numeric.pl
    generate_training_data_symbolic.pl

that show how you can use the module to generate synthetic training data.  Synthetic
training data is generated according to the specifications laid out in a parameter
file.  There are constraints on how the information is laid out in a parameter file.
See the example parameter files supplied with these scripts in the C<Examples>
directory for how to structure these files.

The C<Examples> directory of Versions 2.1 and higher of the module also contains the
following two scripts:

    evaluate_training_data1.pl
    evaluate_training_data2.pl

that illustrate how the Perl class C<EvalTrainingData> can be used to evaluate the
quality of your training data (as long as it resides in a `C<.csv>' file).  This new
class is a subclass of the C<DecisionTree> class in the module file.  See the README
in the C<Examples> directory for further information regarding these two scripts.
A minimal usage sketch follows.

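As a quick orientation, the flow in those scripts looks roughly as follows.  The file
name and column indexes are placeholders, and the method name
C<evaluate_training_data()> is my recollection of the interface, so check it against
C<evaluate_training_data1.pl>:

    use Algorithm::DecisionTree;

    # EvalTrainingData is defined in the same module file as Algorithm::DecisionTree.
    my $eval_data = EvalTrainingData->new(
                        training_datafile => 'my_training.csv',   # placeholder
                        csv_class_column_index => 1,
                        csv_columns_for_features => [2,3,4,5],
                        entropy_threshold => 0.01,
                        max_depth_desired => 3,
                        symbolic_to_numeric_cardinality_threshold => 10,
                    );
    $eval_data->get_training_data();
    $eval_data->evaluate_training_data();   # prints an assessment of the data quality
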
The C<Examples> directory of Versions 2.31 and higher of the module contains the
following three scripts:

    introspection_in_a_loop_interactive.pl

    introspection_show_training_samples_at_all_nodes_direct_influence.pl

    introspection_show_training_samples_to_nodes_influence_propagation.pl

The first script illustrates how to use the C<DTIntrospection> class of the module
interactively for generating explanations for the classification decisions made at
the nodes of the decision tree.  In the interactive session you are first asked for
the node number you are interested in.  Subsequently, you are asked whether or not
you are interested in specific questions that the introspector can provide answers
for.  The second script generates a tabular display that shows for each node of the
decision tree a list of the training samples that fall directly in the portion of the
feature space assigned to that node.  (As mentioned elsewhere in this documentation,
when this list is empty for a node, that means the node is a result of the
generalization achieved by probabilistic modeling of the data.  Note that this module
constructs a decision tree NOT by partitioning the set of training samples, BUT by
partitioning the domains of the probability density functions.)  The third script
listed above also generates a tabular display, but one that shows how the influence
of each training sample propagates in the tree.  This display first shows the list of
nodes that are affected directly by the data in a training sample.  This list is
followed by an indented display of the nodes that are affected indirectly by the
training sample.  A training sample affects a node indirectly if the node is a
descendant of one of the nodes affected directly.

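To give a flavor of the interactive introspection, here is a rough sketch.  Both the
constructor form and the method names are assumptions based on my reading of these
scripts; treat C<introspection_in_a_loop_interactive.pl> as the authoritative
reference:

    # $dt and $root_node are the classifier and tree root constructed as shown earlier.
    my $introspector = DTIntrospection->new(dt => $dt);   # constructor arguments assumed
    $introspector->initialize();

    # Ask for an explanation of the classification decisions at node 5; the method
    # name and its argument are assumptions to verify against the scripts.
    $introspector->explain_classification_at_one_node(5);
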
The latest addition to the C<Examples> directory is the script:

    get_indexes_associated_with_fields.py

As to why you may find this script useful, note that large database files may have
hundreds of fields and it is not always easy to figure out what numerical index is
associated with a given field.  At the same time, the constructor of the DecisionTree
module requires that the field that holds the class label and the fields that contain
the feature values be specified by their numerical zero-based indexes.  If you have a
large database and you are faced with this problem, you can run this script to see
the zero-based numerical index values associated with the different columns of your
CSV file.

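If all you need is a quick look at the header row, a generic Perl one-liner can
produce a similar listing.  This one-liner is my own illustration (not part of the
distribution) and assumes a simple comma-separated header with no embedded commas;
C<my_database.csv> is a placeholder:

    perl -F, -lane 'if ($. == 1) { print "$_  =>  $F[$_]" for 0 .. $#F; exit }' my_database.csv
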
=head1 THE C<ExamplesBagging> DIRECTORY

The C<ExamplesBagging> directory contains the following scripts:

    bagging_for_classifying_one_test_sample.pl

    bagging_for_bulk_classification.pl

As the names of the scripts imply, the first shows how to call the different methods
of the C<DecisionTreeWithBagging> class for classifying a single test sample.  When
you are classifying a single test sample, you can also see how each bag is
classifying the test sample.  You can, for example, display the training data used in
each bag, the decision tree constructed for each bag, etc.  The single-sample case is
sketched below.

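The sketch below shows the general shape of that calling sequence.  The module name
C<Algorithm::DecisionTreeWithBagging>, the constructor options, and the method names
are written from memory of the bagging scripts, and the file name and column indexes
are placeholders.  Verify the details against
C<bagging_for_classifying_one_test_sample.pl>:

    use Algorithm::DecisionTreeWithBagging;

    my $bagger = Algorithm::DecisionTreeWithBagging->new(
                     training_datafile => 'my_training.csv',   # placeholder
                     csv_class_column_index => 1,
                     csv_columns_for_features => [2,3,4,5],
                     how_many_bags => 4,
                     bag_overlap_fraction => 0.2,
                     csv_cleanup_needed => 1,
                 );
    $bagger->get_training_data_for_bagging();
    $bagger->calculate_first_order_probabilities();
    $bagger->calculate_class_priors();
    $bagger->construct_decision_trees_for_bags();

    my @test_sample = ('feature1=3.7', 'feature2=1.4', 'feature3=0.2', 'feature4=5.1');
    $bagger->classify_with_bagging(\@test_sample);
    $bagger->display_classification_results_for_each_bag();
    $bagger->get_majority_vote_classification();
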
The second script is for the case when you place all of the test samples in a single
file.  The demonstration script displays for each test sample a single aggregate
classification decision that is obtained through majority voting by all the decision
trees.

=head1 THE C<ExamplesBoosting> DIRECTORY

The C<ExamplesBoosting> subdirectory in the main installation directory contains the
following three scripts:

    boosting_for_classifying_one_test_sample_1.pl

    boosting_for_classifying_one_test_sample_2.pl

    boosting_for_bulk_classification.pl

As the names of the first two scripts imply, these show how to call the different
methods of the C<BoostedDecisionTree> class for classifying a single test sample.
When you are classifying a single test sample, you can see how each stage of the
cascade of decision trees is classifying the test sample.  You can also view each
decision tree separately and see the trust factor associated with each tree.  A
sketch of the single-sample case appears below.

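Here is a comparable sketch for boosting.  As with the bagging example, the module
name C<Algorithm::BoostedDecisionTree>, the constructor options, and the method names
are reproduced from memory and should be checked against the boosting scripts; the
file name and column indexes are placeholders:

    use Algorithm::BoostedDecisionTree;

    my $booster = Algorithm::BoostedDecisionTree->new(
                      training_datafile => 'my_training.csv',   # placeholder
                      csv_class_column_index => 1,
                      csv_columns_for_features => [2,3,4,5],
                      how_many_stages => 4,
                      csv_cleanup_needed => 1,
                  );
    $booster->get_training_data_for_base_tree();
    $booster->calculate_first_order_probabilities_and_class_priors();
    $booster->construct_base_decision_tree();
    $booster->construct_cascade_of_decision_trees();

    my @test_sample = ('feature1=3.7', 'feature2=1.4', 'feature3=0.2', 'feature4=5.1');
    $booster->classify_with_boosting(\@test_sample);
    $booster->display_classification_results_for_each_stage();
    $booster->trust_weighted_majority_vote_classifier();
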
The third script is for the case when you place all of the test samples in a single
file.  The demonstration script outputs for each test sample a single aggregate
classification decision that is obtained through trust-factor weighted majority
voting by all the decision trees.

=head1 THE C<ExamplesRandomizedTrees> DIRECTORY

The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
become more familiar with the C<RandomizedTreesForBigData> class for solving
needle-in-a-haystack and big-data classification problems.  These scripts are:

    randomized_trees_for_classifying_one_test_sample_1.pl

    randomized_trees_for_classifying_one_test_sample_2.pl

    classify_database_records.pl

The first script shows the constructor options to use for solving a
needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
training data belongs to just one class.  The second script shows the constructor
options for using randomized decision trees for the case when you have access to a
very large database of training samples and you'd like to construct an ensemble of
decision trees using training samples pulled randomly from the training database.
The last script illustrates how you can evaluate the classification power of an
ensemble of decision trees as constructed by C<RandomizedTreesForBigData> by
classifying a large number of test samples extracted randomly from the training
database.  A sketch of the needle-in-a-haystack case appears below.

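For the needle-in-a-haystack case, the calling sequence has roughly the following
shape.  The module name C<Algorithm::RandomizedTreesForBigData>, the constructor
option C<looking_for_needles_in_haystack>, and the method names are my recollection
of the interface; the file name and column indexes are placeholders.  Use
C<randomized_trees_for_classifying_one_test_sample_1.pl> as the authoritative
reference:

    use Algorithm::RandomizedTreesForBigData;

    my $rt = Algorithm::RandomizedTreesForBigData->new(
                 training_datafile => 'my_large_training.csv',   # placeholder
                 csv_class_column_index => 1,
                 csv_columns_for_features => [2,3,4,5],
                 how_many_trees => 5,
                 looking_for_needles_in_haystack => 1,   # option assumed for this use case
                 csv_cleanup_needed => 1,
             );
    $rt->get_training_data_for_N_trees();
    $rt->calculate_first_order_probabilities();
    $rt->calculate_class_priors();
    $rt->construct_all_decision_trees();

    my @test_sample = ('feature1=3.7', 'feature2=1.4', 'feature3=0.2', 'feature4=5.1');
    $rt->classify_with_all_trees(\@test_sample);
    $rt->display_classification_results_for_all_trees();
    $rt->get_majority_vote_classification();
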
=head1 THE C<ExamplesRegression> DIRECTORY

The C<ExamplesRegression> subdirectory in the main installation directory shows
example scripts that you can use to become familiar with regression trees and how
they can be used for nonlinear regression.  If you are new to the concept of
regression trees, start by executing the following scripts without changing them and
see what sort of output is produced by them:

    regression4.pl

    regression5.pl

    regression6.pl

    regression8.pl

The C<regression4.pl> script involves only one predictor variable and one dependent
variable.  The training data for this exercise is drawn from a CSV file supplied in
the same directory.  This data file contains strongly nonlinear data.  When you run
the script C<regression4.pl>, you will see how much better the result from tree
regression is compared to what you can get with linear regression.

The C<regression5.pl> script is essentially the same as the previous script except
for the fact that the training datafile used in this case consists of three noisy
segments, as opposed to just two in the previous case.

The script C<regression6.pl> deals with the case when we have two predictor variables
and one dependent variable.  You can think of the data as consisting of noisy height
values over an C<(x1,x2)> plane.  The data used in this script is drawn from a CSV
file supplied in the same directory.

Finally, the script C<regression8.pl> shows how you can carry out bulk prediction for
all your test data records in a disk file.  The script writes all the calculated
predictions into another disk file whose name is derived from the name of the test
data file.  A sketch of the basic regression-tree calls is shown below.

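The basic calls in these regression scripts look roughly as follows.  The module name
C<Algorithm::RegressionTree> (shipped with this distribution), the constructor
options, and the method names are written from memory; the file name, column indexes,
and the predictor name in the test point are placeholders.  Check C<regression4.pl>
for the exact interface:

    use Algorithm::RegressionTree;

    my $reg_tree = Algorithm::RegressionTree->new(
                       training_datafile => 'my_regression_data.csv',   # placeholder
                       dependent_variable_column => 2,   # column holding the dependent variable
                       predictor_columns => [1],         # column(s) holding the predictor variable(s)
                       max_depth_desired => 2,
                       csv_cleanup_needed => 1,
                   );
    $reg_tree->get_training_data_for_regression();
    my $root_node = $reg_tree->construct_regression_tree();
    $root_node->display_regression_tree("     ");

    # Prediction for a single data point, supplied as 'predictor=value' strings:
    my @test_point = ('xcoord=30');   # 'xcoord' is a placeholder predictor name
    my $prediction = $reg_tree->prediction_for_single_data_point($root_node, \@test_point);
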
=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs.  When sending email, please place
the string 'DecisionTree' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice.  Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-DecisionTree-3.43.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-DecisionTree-3.43>.  Enter this directory and execute the following
commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules.  How you do that
would depend on what platform you are working on.  In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

Another option for a non-standard install would be to place at the top of your own
scripts the following invocation:

    BEGIN {
        unshift @INC, "/some/other/directory/blib/lib";
    }

=head1 THANKS

I wish to thank the many users of this module for their feedback.  Many of the
improvements I have made to the module over the years are a result of the feedback
received.

I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x.  For
Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
used for testing the module.  Slaven's feedback in the form of the Bug report
C<#96547> resulted in Version 2.23 of the module.  Version 2.25 further downshifts
the required version of Perl to 5.10.

On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version 2.27,
I am removing the Perl version restriction altogether from Version 2.30.  Thanks,
Slaven!

Version 3.43 was prompted by a bug report sent by Jan Trukenmüller regarding a
missing backslash in a regex in the CSV cleanup code.  Thanks, Jan!

=head1 AUTHOR

The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
with the publication of the book I<Designing with Objects> by John-Wiley.  If
interested, visit his web page at Purdue to find out what this project was all
about.  You might like I<Designing with Objects> especially if you enjoyed reading
Harry Potter as a kid (or even as an adult, for that matter).

If you send email regarding this module, please place the string "DecisionTree" in
your subject line to get past my spam filter.  Avi Kak's email address is
C<kak@purdue.edu>.

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2017 Avinash Kak

=cut