package Algorithm::DecisionTree;

#--------------------------------------------------------------------------------------
# Copyright (c) 2016 Avinash Kak. All rights reserved. This program is free
# software. You may modify and/or distribute it under the same terms as Perl itself.
# This copyright notice must remain attached to the file.
#
# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
# multidimensional data.
# -------------------------------------------------------------------------------------

#use 5.10.0;
use strict;
use warnings;
use Carp;

our $VERSION = '3.41';

############################################ Constructor ##############################################
sub new {
    my ($class, %args, $eval_or_boosting_mode);
    if (@_ % 2 != 0) {
        ($class, %args) = @_;
    } else {
        $class = shift;
        $eval_or_boosting_mode = shift;
        die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
        die "Only one string arg allowed in eval and boosting modes" if @_;
    }
    unless ($eval_or_boosting_mode) {
        my @params = keys %args;
        croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
            if check_for_illegal_params2(@params) == 0;
    }
    bless {
        _training_datafile                          =>  $args{training_datafile},
        _entropy_threshold                          =>  $args{entropy_threshold} || 0.01,
        _max_depth_desired                          =>  exists $args{max_depth_desired} ?
                                                            $args{max_depth_desired} : undef,
        _debug1                                     =>  $args{debug1} || 0,
        _debug2                                     =>  $args{debug2} || 0,
        _debug3                                     =>  $args{debug3} || 0,
        _csv_class_column_index                     =>  $args{csv_class_column_index} || undef,
        _csv_columns_for_features                   =>  $args{csv_columns_for_features} || undef,
        _symbolic_to_numeric_cardinality_threshold  =>  $args{symbolic_to_numeric_cardinality_threshold} || 10,
        _number_of_histogram_bins                   =>  $args{number_of_histogram_bins} || undef,
        _training_data                              =>  [],
        _root_node                                  =>  undef,
        _probability_cache                          =>  {},
        _entropy_cache                              =>  {},
        _training_data_hash                         =>  {},
        _features_and_values_hash                   =>  {},
        _samples_class_label_hash                   =>  {},
        _class_names                                =>  [],
        _class_priors                               =>  [],
        _class_priors_hash                          =>  {},
        _feature_names                              =>  [],
        _numeric_features_valuerange_hash           =>  {},
        _sampling_points_for_numeric_feature_hash   =>  {},
        _feature_values_how_many_uniques_hash       =>  {},
        _prob_distribution_numeric_features_hash    =>  {},
        _histogram_delta_hash                       =>  {},
        _num_of_histogram_bins_hash                 =>  {},
    }, $class;
}
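
##  A minimal usage sketch for the constructor (the file name, column indices, and
##  most keyword values below are illustrative placeholders, not requirements of the
##  module):
##
##      use Algorithm::DecisionTree;
##      my $dt = Algorithm::DecisionTree->new(
##                    training_datafile                          => 'training_data.csv',
##                    csv_class_column_index                     => 1,
##                    csv_columns_for_features                   => [2,3,4,5],
##                    entropy_threshold                          => 0.01,
##                    max_depth_desired                          => 8,
##                    symbolic_to_numeric_cardinality_threshold  => 10,
##               );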

#################################### Classify with Decision Tree #######################################

## Classifies one test sample at a time using the decision tree constructed from
## your training file. The data record for the test sample must be supplied as
## shown in the scripts in the `examples' subdirectory. See the scripts
## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
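##
##  Usage sketch (the feature names and values are illustrative only; $root_node is
##  the node returned by construct_decision_tree_classifier() shown later in this
##  file, and the test sample is an array of 'feature=value' strings):
##
##      my @test_sample = ('feature1=3.4', 'feature2=high', 'feature3=0.7');
##      my $classification = $dt->classify($root_node, \@test_sample);
##      foreach my $class (sort keys %$classification) {
##          next if $class eq 'solution_path';
##          print "$class  =>  $classification->{$class}\n";
##      }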
sub classify {
    my $self = shift;
    my $root_node = shift;
    my $feature_and_values = shift;
    my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
    my @features_and_values = @$feature_and_values;
    @features_and_values = @{deep_copy_array(\@features_and_values)};
    die "Error in the names you have used for features and/or values"
        unless $self->check_names_used(\@features_and_values);
    my @new_features_and_values = ();
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        my ($feature, $value) = ($1, $2);
        my $newvalue = $value;
        my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
        my $not_all_values_float = 0;
        map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
        if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
            $not_all_values_float == 0) {
            $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
        }
        push @new_features_and_values, "$feature" . '=' . "$newvalue";
    }
    @features_and_values = @new_features_and_values;
    print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %classification = %{$self->recursive_descent_for_classification($root_node,
                                                        \@features_and_values, \%answer)};
    @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
    if ($self->{_debug3}) {
        print "\nCL2 The classification:\n";
        foreach my $class_name (@{$self->{_class_names}}) {
            print "    $class_name with probability $classification{$class_name}\n";
        }
    }
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} = \@outlist;
        }
    }
    return \%classification_for_display;
}

sub recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $features_and_values = shift;
    my $answer = shift;
    my @features_and_values = @$features_and_values;
    my %answer = %$answer;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my $feature_tested_at_node = $node->get_feature();
    print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
        if $self->{_debug3};
    my $value_for_feature;
    my $path_found;
    my $pattern = '(\S+)\s*=\s*(\S+)';
    foreach my $feature_and_value (@features_and_values) {
        $feature_and_value =~ /$pattern/;
        $value_for_feature = $2 if $feature_tested_at_node eq $1;
    }
    # The following clause introduced in Version 3.20
    if (!defined $value_for_feature) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    if ($value_for_feature) {
        if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
            print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
            my $pattern1 = '(.+)<(.+)';
            my $pattern2 = '(.+)>(.+)';
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
                if ($last_feature_and_value_on_branch =~ /$pattern1/) {
                    my ($feature, $threshold) = ($1,$2);
                    if ($value_for_feature <= $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                            $features_and_values, \%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
                if ($last_feature_and_value_on_branch =~ /$pattern2/) {
                    my ($feature, $threshold) = ($1,$2);
                    if ($value_for_feature > $threshold) {
                        $path_found = 1;
                        %answer = %{$self->recursive_descent_for_classification($child,
                                                            $features_and_values, \%answer)};
                        push @{$answer{'solution_path'}}, $node->get_serial_num();
                        last;
                    }
                }
            }
            return \%answer if $path_found;
        } else {
            my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
            print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
                if $self->{_debug3};
            foreach my $child (@children) {
                my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
                print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
                my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
                if ($last_feature_and_value_on_branch eq $feature_value_combo) {
                    %answer = %{$self->recursive_descent_for_classification($child,
                                                        $features_and_values, \%answer)};
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    $path_found = 1;
                    last;
                }
            }
            return \%answer if $path_found;
        }
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

## If you want classification to be carried out by engaging a human user in a
## question-answer session, this is the method to use for that purpose. See, for
## example, the script classify_by_asking_questions.pl in the `examples'
## subdirectory for an illustration of how to do that.
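##
##  Usage sketch (assumes the decision tree has already been built and its root node
##  is in $root_node; the method prompts on standard input for one feature value at a
##  time and returns the same kind of hash as classify()):
##
##      my $classification = $dt->classify_by_asking_questions($root_node);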
sub classify_by_asking_questions {
    my $self = shift;
    my $root_node = shift;
    my %answer = ();
    foreach my $class_name (@{$self->{_class_names}}) {
        $answer{$class_name} = undef;
    }
    $answer{'solution_path'} = [];
    my %scratchpad_for_numeric_answers = ();
    foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
        $scratchpad_for_numeric_answers{$feature_name} = undef;
    }
    my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
                                               \%answer, \%scratchpad_for_numeric_answers)};
    @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
    my %classification_for_display = ();
    foreach my $item (keys %classification) {
        if ($item ne 'solution_path') {
            $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
        } else {
            my @outlist = ();
            foreach my $x (@{$classification{$item}}) {
                push @outlist, "NODE$x";
            }
            $classification_for_display{$item} = \@outlist;
        }
    }
    return \%classification_for_display;
}

sub interactive_recursive_descent_for_classification {
    my $self = shift;
    my $node = shift;
    my $answer = shift;
    my $scratchpad_for_numerics = shift;
    my %answer = %$answer;
    my %scratchpad_for_numerics = %$scratchpad_for_numerics;
    my $pattern1 = '(.+)<(.+)';
    my $pattern2 = '(.+)>(.+)';
    my $user_value_for_feature;
    my @children = @{$node->get_children()};
    if (@children == 0) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
        return \%answer;
    }
    my @list_of_branch_attributes_to_children = ();
    foreach my $child (@children) {
        my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
        my $feature_and_value_on_branch = $branch_features_and_values[-1];
        push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
    }
    my $feature_tested_at_node = $node->get_feature();
    my $feature_value_combo;
    my $path_found = 0;
    if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
        if ($scratchpad_for_numerics{$feature_tested_at_node}) {
            $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
        } else {
            my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
            while (1) {
                print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
                print "\nEnter a value in the range (@valuerange): ";
                $user_value_for_feature = <STDIN>;     # read the user's answer from standard input
                $user_value_for_feature =~ s/\r?\n?$//;
                $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
                my $answer_found = 0;
                if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
                    $answer_found = 1;
                    last;
                }
                last if $answer_found;
                print("You entered illegal value. Let's try again");
            }
            $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
        }
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute =~ /$pattern1/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature <= $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                        \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
            if ($branch_attribute =~ /$pattern2/) {
                my ($feature,$threshold) = ($1,$2);
                if ($user_value_for_feature > $threshold) {
                    %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                        \%answer, \%scratchpad_for_numerics)};
                    $path_found = 1;
                    push @{$answer{'solution_path'}}, $node->get_serial_num();
                    last;
                }
            }
        }
        return \%answer if $path_found;
    } else {
        my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
        while (1) {
            print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
            print "\nEnter a value from the list (@possible_values_for_feature): ";
            $user_value_for_feature = <STDIN>;     # read the user's answer from standard input
            $user_value_for_feature =~ s/\r?\n?$//;
            $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
            my $answer_found = 0;
            if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
                $answer_found = 1;
                last;
            }
            last if $answer_found;
            print("You entered illegal value. Let's try again");
        }
        $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
        foreach my $i (0..@list_of_branch_attributes_to_children-1) {
            my $branch_attribute = $list_of_branch_attributes_to_children[$i];
            if ($branch_attribute eq $feature_value_combo) {
                %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
                                                    \%answer, \%scratchpad_for_numerics)};
                $path_found = 1;
                push @{$answer{'solution_path'}}, $node->get_serial_num();
                last;
            }
        }
        return \%answer if $path_found;
    }
    if (! $path_found) {
        my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
        foreach my $i (0..@{$self->{_class_names}}-1) {
            $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
        }
        push @{$answer{'solution_path'}}, $node->get_serial_num();
    }
    return \%answer;
}

###################################### Decision Tree Construction ####################################

## At the root node, we find the best feature that yields the greatest reduction in
## class entropy from the entropy based on just the class priors. The logic for
## finding this feature is different for symbolic features and for numeric features.
## That logic is built into the method shown later for best feature calculations.
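##
##  A typical calling sequence, sketched after the example scripts (the training-data
##  ingestion methods named here live elsewhere in this module and are not shown in
##  this section):
##
##      $dt->get_training_data();
##      $dt->calculate_first_order_probabilities();
##      $dt->calculate_class_priors();
##      my $root_node = $dt->construct_decision_tree_classifier();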
sub construct_decision_tree_classifier {
    print "\nConstructing the decision tree ...\n";
    my $self = shift;
    if ($self->{_debug3}) {
        $self->determine_data_condition();
        print "\nStarting construction of the decision tree:\n";
    }
    my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
    if ($self->{_debug3}) {
        print "\nPrior class probabilities: @class_probabilities\n";
        print "\nClass names: @{$self->{_class_names}}\n";
    }
    my $entropy = $self->class_entropy_on_priors();
    print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
    my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
    $root_node->set_class_names(\@{$self->{_class_names}});
    $self->{_root_node} = $root_node;
    $self->recursive_descent($root_node);
    return $root_node;
}

## After the root node of the decision tree is calculated by the previous methods,
## we invoke this method recursively to create the rest of the tree. At each node,
## we find the feature that achieves the largest entropy reduction with regard to
## the partitioning of the training data samples that correspond to that node.
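##
##  For orientation, the node-splitting test used in this method boils down to: a
##  child node is created only when (parent entropy - child entropy) exceeds the
##  entropy_threshold constructor parameter. With the default threshold of 0.01, a
##  node with entropy 0.97 and a candidate child with entropy 0.88 gives a gain of
##  0.09 and is therefore expanded; recursion also stops once the branch length
##  reaches max_depth_desired.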
sub recursive_descent {
    my $self = shift;
    my $node = shift;
    print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
        if $self->{_debug3};
    my $node_serial_number = $node->get_serial_num();
    my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
    my $existing_node_entropy = $node->get_node_entropy();
    if ($self->{_debug3}) {
        print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
        print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
        print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
        my @class_probs = @{$node->get_class_probabilities()};
        print "\nRD4 Class probabilities: @class_probs\n";
    }
    if ($existing_node_entropy < $self->{_entropy_threshold}) {
        print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
        return;
    }
    my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
    my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
        $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
    $node->set_feature($best_feature);
    $node->display_node() if $self->{_debug3};
    if (defined($self->{_max_depth_desired}) &&
        (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
        print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
        return;
    }
    return if ! defined $best_feature;
    if ($self->{_debug3}) {
        print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
        print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
        print "\nRD9 Best feature entropy: $best_feature_entropy\n";
        print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
    }
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
    print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
    if ($entropy_gain > $self->{_entropy_threshold}) {
        if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
            $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my $best_threshold = $decision_val;        # as returned by best feature calculator
            my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
            my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
                @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
                @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
            my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
            my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
            push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
                $feature_threshold_combo_for_less_than;
            push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
                $feature_threshold_combo_for_greater_than;
            if ($self->{_debug3}) {
                print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
                    "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
                print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
                    "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
            }
            my @class_probabilities_for_lessthan_child_node =
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                    \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
            my @class_probabilities_for_greaterthan_child_node =
                map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                    \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
            if ($self->{_debug3}) {
                print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
                print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
            }
            if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
                    \@class_probabilities_for_lessthan_child_node,
                    \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
                $node->add_child_link($left_child_node);
                $self->recursive_descent($left_child_node);
            }
            if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
                my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
                    \@class_probabilities_for_greaterthan_child_node,
                    \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
                $node->add_child_link($right_child_node);
                $self->recursive_descent($right_child_node);
            }
        } else {
            print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
                if $self->{_debug3};
            my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
            print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
            my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
            my @class_entropies_for_children = ();
            foreach my $feature_and_value_index (0..@feature_value_combos-1) {
                print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
                    if $self->{_debug3};
                my @extended_branch_features_and_values_or_thresholds;
                if (! @features_and_values_or_thresholds_on_branch) {
                    @extended_branch_features_and_values_or_thresholds =
                        ($feature_value_combos[$feature_and_value_index]);
                } else {
                    @extended_branch_features_and_values_or_thresholds =
                        @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @extended_branch_features_and_values_or_thresholds,
                        $feature_value_combos[$feature_and_value_index];
                }
                my @class_probabilities =
                    map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
                        \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
                my $class_entropy_for_child =
                    $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
                        \@extended_branch_features_and_values_or_thresholds);
                if ($self->{_debug3}) {
                    print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
                    print "\nRD20 class entropy for child: $class_entropy_for_child\n";
                }
                if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
                    my $child_node = DTNode->new(undef, $class_entropy_for_child,
                        \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
                    $node->add_child_link($child_node);
                    $self->recursive_descent($child_node);
                } else {
                    print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
                }
            }
        }
    } else {
        print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
            if $self->{_debug3};
        return;
    }
}

## This is the heart of the decision tree constructor. Its main job is to figure
## out the best feature to use for partitioning the training data samples that
## correspond to the current node. The search for the best feature is carried out
## differently for symbolic features and for numeric features. For a symbolic
## feature, the method estimates the entropy for each value of the feature and then
## averages out these entropies as a measure of the discriminatory power of that
## feature. For a numeric feature, on the other hand, it estimates the entropy
## reduction that can be achieved if we were to partition the set of training samples
## at each possible threshold. For a numeric feature, all possible sampling points
## relevant to the node in question are considered as candidates for thresholds.
543
29
29
0
24
my $self = shift;
544
29
27
my $features_and_values_or_thresholds_on_branch = shift;
545
29
16
my $existing_node_entropy = shift;
546
29
29
my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
547
29
23
my $pattern1 = '(.+)=(.+)';
548
29
18
my $pattern2 = '(.+)<(.+)';
549
29
17
my $pattern3 = '(.+)>(.+)';
550
29
34
my @all_symbolic_features = ();
551
29
19
foreach my $feature_name (@{$self->{_feature_names}}) {
29
38
552
push @all_symbolic_features, $feature_name
553
116
50
171
if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
554
}
555
29
25
my @symbolic_features_already_used = ();
556
29
27
foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
557
81
50
309
push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
558
}
559
29
17
my @symbolic_features_not_yet_used;
560
29
71
foreach my $x (@all_symbolic_features) {
561
116
100
112
push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
562
}
563
29
26
my @true_numeric_types = ();
564
29
23
my @symbolic_types = ();
565
29
18
my @true_numeric_types_feature_names = ();
566
29
32
my @symbolic_types_feature_names = ();
567
29
26
foreach my $item (@features_and_values_or_thresholds_on_branch) {
568
81
50
250
if ($item =~ /$pattern2/) {
50
50
569
0
0
push @true_numeric_types, $item;
570
0
0
push @true_numeric_types_feature_names, $1;
571
} elsif ($item =~ /$pattern3/) {
572
0
0
push @true_numeric_types, $item;
573
0
0
push @true_numeric_types_feature_names, $1;
574
} elsif ($item =~ /$pattern1/) {
575
81
64
push @symbolic_types, $item;
576
81
103
push @symbolic_types_feature_names, $1;
577
} else {
578
0
0
die "format error in the representation of feature and values or thresholds";
579
}
580
}
581
29
34
my %seen = ();
582
29
0
27
@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
0
0
583
29
15
%seen = ();
584
29
50
26
@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
81
191
585
my @bounded_intervals_numeric_types =
586
29
23
@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
29
41
587
# Calculate the upper and the lower bounds to be used when searching for the best
588
# threshold for each of the numeric features that are in play at the current node:
589
29
25
my (%upperbound, %lowerbound);
590
29
26
foreach my $feature (@true_numeric_types_feature_names) {
591
0
0
$upperbound{$feature} = undef;
592
0
0
$lowerbound{$feature} = undef;
593
}
594
29
27
foreach my $item (@bounded_intervals_numeric_types) {
595
0
0
foreach my $feature_grouping (@$item) {
596
0
0
0
if ($feature_grouping->[1] eq '>') {
597
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
598
} else {
599
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
600
}
601
}
602
}
603
29
30
my %entropy_values_for_different_features = ();
604
29
17
my %partitioning_point_child_entropies_hash = ();
605
29
22
my %partitioning_point_threshold = ();
606
29
25
my %entropies_for_different_values_of_symbolic_feature = ();
607
29
34
foreach my $feature (@{$self->{_feature_names}}) {
29
37
608
116
113
$entropy_values_for_different_features{$feature} = [];
609
116
90
$partitioning_point_child_entropies_hash{$feature} = {};
610
116
89
$partitioning_point_threshold{$feature} = undef;
611
116
126
$entropies_for_different_values_of_symbolic_feature{$feature} = [];
612
}
613
29
22
foreach my $i (0..@{$self->{_feature_names}}-1) {
29
49
614
116
95
my $feature_name = $self->{_feature_names}->[$i];
615
116
50
139
print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
616
116
100
33
116
if (contained_in($feature_name, @symbolic_features_already_used)) {
50
617
81
87
next;
618
35
58
} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
619
$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
620
$self->{_symbolic_to_numeric_cardinality_threshold}) {
621
0
0
my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
622
0
0
0
print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
623
0
0
my @newvalues = ();
624
0
0
0
if (contained_in($feature_name, @true_numeric_types_feature_names)) {
625
0
0
0
0
if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
0
0
0
0
0
0
626
$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
627
0
0
next;
628
} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
629
$lowerbound{$feature_name} < $upperbound{$feature_name}) {
630
0
0
foreach my $x (@values) {
631
0
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
632
}
633
} elsif (defined($upperbound{$feature_name})) {
634
0
0
foreach my $x (@values) {
635
0
0
0
push @newvalues, $x if $x <= $upperbound{$feature_name};
636
}
637
} elsif (defined($lowerbound{$feature_name})) {
638
0
0
foreach my $x (@values) {
639
0
0
0
push @newvalues, $x if $x > $lowerbound{$feature_name};
640
}
641
} else {
642
0
0
die "Error is bound specifications in best feature calculator";
643
                }
            } else {
                @newvalues = @{deep_copy_array(\@values)};
            }
            next if @newvalues == 0;
            my @partitioning_entropies = ();
            foreach my $value (@newvalues) {
                my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
                my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
                my @for_left_child;
                my @for_right_child;
                if (@features_and_values_or_thresholds_on_branch) {
                    @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_left_child, $feature_and_less_than_value_string;
                    @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                    push @for_right_child, $feature_and_greater_than_value_string;
                } else {
                    @for_left_child = ($feature_and_less_than_value_string);
                    @for_right_child = ($feature_and_greater_than_value_string);
                }
                my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
                    \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
                my $partitioning_entropy = $entropy1 *
                    $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
                    $entropy2 *
                    $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
                push @partitioning_entropies, $partitioning_entropy;
                $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
            }
            my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
            if ($min_entropy < $existing_node_entropy) {
                $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
                $entropy_values_for_different_features{$feature_name} = $min_entropy;
            }
        } else {
            print "\nBFC2:  Entering section reserved for symbolic features\n" if $self->{_debug3};
            print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
            my %seen;
            my @values = grep {$_ ne 'NA' && !$seen{$_}++}
                @{$self->{_features_and_unique_values_hash}->{$feature_name}};
            @values = sort @values;
            print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
            my $entropy = 0;
            foreach my $value (@values) {
                my $feature_value_string = "$feature_name" . '=' . "$value";
                print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
                my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
                if (@features_and_values_or_thresholds_on_branch) {
                    push @extended_attributes, $feature_value_string;
                } else {
                    @extended_attributes = ($feature_value_string);
                }
                $entropy +=
                    $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
                    $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
                print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
                    "is $entropy\n" if $self->{_debug3};
                push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
            }
            if ($entropy < $existing_node_entropy) {
                $entropy_values_for_different_features{$feature_name} = $entropy;
            }
        }
    }
    my $min_entropy_for_best_feature;
    my $best_feature_name;
    foreach my $feature_nom (keys %entropy_values_for_different_features) {
        if (!defined($best_feature_name)) {
            $best_feature_name = $feature_nom;
            $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
        } else {
            if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
                $best_feature_name = $feature_nom;
                $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
            }
        }
    }
    my $threshold_for_best_feature;
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
    } else {
        $threshold_for_best_feature = undef;
    }
    my $best_feature_entropy = $min_entropy_for_best_feature;
    my @val_based_entropies_to_be_returned;
    my $decision_val_to_be_returned;
    if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
        $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
        $self->{_symbolic_to_numeric_cardinality_threshold}) {
        @val_based_entropies_to_be_returned =
            @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
    } else {
        @val_based_entropies_to_be_returned = ();
    }
    if (exists $partitioning_point_threshold{$best_feature_name}) {
        $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
    } else {
        $decision_val_to_be_returned = undef;
    }
    print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
        "@val_based_entropies_to_be_returned\n" if $self->{_debug3};
    return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
            $decision_val_to_be_returned);
}

######################################### Entropy Calculators #####################################

sub class_entropy_on_priors {
    my $self = shift;
    return $self->{_entropy_cache}->{'priors'}
        if exists $self->{_entropy_cache}->{"priors"};
    my @class_names = @{$self->{_class_names}};
    my $entropy;
    foreach my $class (@class_names) {
        my $prob = $self->prior_probability_for_class($class);
        my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999);
        $log_prob = 0 if $prob < 0.0001;           # since X.log(X)->0 as X->0
        $log_prob = 0 if $prob > 0.999;            # since log(1) = 0
        if (!defined $entropy) {
            $entropy = -1.0 * $prob * $log_prob;
            next;
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    $self->{_entropy_cache}->{'priors'} = $entropy;
    return $entropy;
}
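
##  As a quick check of the formula implemented above, entropy = -sum_i p_i * log2(p_i):
##  two classes with priors 0.5 and 0.5 give 1 bit, while priors 0.9 and 0.1 give about
##  0.469 bits. A self-contained illustration (not part of the module's API):
##
##      my @priors = (0.9, 0.1);
##      my $H = 0;
##      $H += -$_ * log($_) / log(2) for grep {$_ > 0} @priors;   # $H is ~0.469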

sub entropy_scanner_for_a_numeric_feature {
    local $| = 1;
    my $self = shift;
    my $feature = shift;
    my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
    my @entropies_for_less_than_thresholds = ();
    my @entropies_for_greater_than_thresholds = ();
    foreach my $point (@all_sampling_points) {
        print ". ";
        push @entropies_for_less_than_thresholds,
            $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
        push @entropies_for_greater_than_thresholds,
            $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
    }
    print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: " .
        "@entropies_for_less_than_thresholds\n";
    print "\nSCANNER: All entropies greater than thresholds for feature $feature are: " .
        "@entropies_for_greater_than_thresholds\n";
}

sub class_entropy_for_less_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '<' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds =
        @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
            $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}

sub class_entropy_for_greater_than_threshold_for_feature {
    my $self = shift;
    my $arr = shift;
    my $feature = shift;
    my $threshold = shift;
    my @array_of_features_and_values_or_thresholds = @$arr;
    my $feature_threshold_combo = "$feature" . '>' . "$threshold";
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    $sequence .= ":" . $feature_threshold_combo;
    return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
    my @copy_of_array_of_features_and_values_or_thresholds =
        @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
    push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
            $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}

sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
    my $self = shift;
    my $array_of_features_and_values_or_thresholds = shift;
    my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
    my $sequence = join ":", @array_of_features_and_values_or_thresholds;
    return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
    my $entropy = 0;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $log_prob = undef;
        my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
            $class_name, \@array_of_features_and_values_or_thresholds);
        if ($prob >= 0.0001 && $prob <= 0.999) {
            $log_prob = log($prob) / log(2.0);
        } elsif ($prob < 0.0001) {
            $log_prob = 0;
        } elsif ($prob > 0.999) {
            $log_prob = 0;
        } else {
            die "An error has occurred in log_prob calculation";
        }
        $entropy += -1.0 * $prob * $log_prob;
    }
    if (abs($entropy) < 0.0000001) {
        $entropy = 0.0;
    }
    $self->{_entropy_cache}->{$sequence} = $entropy;
    return $entropy;
}

##################################### Probability Calculators ########################################

sub prior_probability_for_class {
    my $self = shift;
    my $class = shift;
    my $class_name_in_cache = "prior" . '::' . $class;
    return $self->{_probability_cache}->{$class_name_in_cache}
        if exists $self->{_probability_cache}->{$class_name_in_cache};
    my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
    my @values = values %{$self->{_samples_class_label_hash}};
    foreach my $class_name (@{$self->{_class_names}}) {
        my @trues = grep {$_ eq $class_name} @values;
        my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
        my $this_class_name_in_cache = "prior" . '::' . $class_name;
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    return $self->{_probability_cache}->{$class_name_in_cache};
}

sub calculate_class_priors {
    my $self = shift;
    return if scalar keys %{$self->{_class_priors_hash}} > 1;
    foreach my $class_name (@{$self->{_class_names}}) {
        my $class_name_in_cache = "prior::$class_name";
        my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
        my @all_values = values %{$self->{_samples_class_label_hash}};
        my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
        my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
        $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
        my $this_class_name_in_cache = "prior::$class_name";
        $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
    }
    if ($self->{_debug1}) {
        foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
            print "$class => $self->{_class_priors_hash}->{$class}\n";
        }
    }
}
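
##  For example, if the training file held 100 samples of which 62 carried the class
##  label 'benign' and 38 the label 'malignant' (labels chosen purely for
##  illustration), the loop above would cache the priors 0.62 and 0.38 under the
##  keys 'prior::benign' and 'prior::malignant' respectively.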

sub calculate_first_order_probabilities {
    print "\nEstimating probabilities...\n";
    my $self = shift;
    foreach my $feature (@{$self->{_feature_names}}) {
        $self->probability_of_feature_value($feature, undef);
        if ($self->{_debug2}) {
            if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
                print "\nPresenting probability distribution for a numeric feature:\n";
                foreach my $sampling_point (sort {$a <=> $b} keys
                            %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
                    my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
                    print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
                        $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
                }
            } else {
                print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
                my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
                foreach my $value (sort @values_for_feature) {
                    my $prob = $self->probability_of_feature_value($feature,$value);
                    print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
                }
            }
        }
    }
}

sub probability_of_feature_value {
    my $self = shift;
    my $feature_name = shift;
    my $value = shift;
    $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
    if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $value = closest_sampling_point($value,
                     $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_and_value;
    if (defined($value)) {
        $feature_and_value = "$feature_name=$value";
    }
    if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
        return $self->{_probability_cache}->{$feature_and_value};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
                @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
                $diffrange = $valuerange[1] - $valuerange[0];
                my %seen = ();
                my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                    @{$self->{_features_and_values_hash}->{$feature_name}};
                my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
                    $unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
                my $median_diff = $diffs[int(@diffs/2) - 1];
                $histogram_delta = $median_diff * 2;
                if ($histogram_delta < $diffrange / 500.0) {
                    if (defined $self->{_number_of_histogram_bins}) {
                        $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
                    } else {
                        $histogram_delta = $diffrange / 500.0;
                    }
                }
                $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
                $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
                $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
                my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
                    0..$num_of_histogram_bins-1;
                $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
                    \@sampling_points_for_feature;
            }
        }
    }
    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            my @sampling_points_for_feature =
                @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
            my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
            my @actual_values_for_feature = grep {$_ ne 'NA'}
                @{$self->{_features_and_values_hash}->{$feature_name}};
            foreach my $i (0..@sampling_points_for_feature-1) {
                foreach my $j (0..@actual_values_for_feature-1) {
                    if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
                        $counts_at_sampling_points[$i]++;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @counts_at_sampling_points;
            my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
            my %bin_prob_hash = ();
            foreach my $i (0..@sampling_points_for_feature-1) {
                $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
            }
            $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
            my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
                @sampling_points_for_feature;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        } else {
            my %seen = ();
            my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
                @{$self->{_features_and_values_hash}->{$feature_name}};
            @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
            my @value_counts = (0) x @values_for_feature;
            # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
            foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
                my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
                foreach my $i (0..@values_for_feature-1) {
                    foreach my $current_value (@features_and_values) {
                        $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                    }
                }
            }
            my $total_counts = 0;
            map {$total_counts += $_} @value_counts;
            die "PFV Something is wrong with your training file. It contains no training samples \
                 for feature named $feature_name" if $total_counts == 0;
            my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
            foreach my $i (0..@values_for_feature-1) {
                $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
            }
            if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
                return $self->{_probability_cache}->{$feature_and_value};
            } else {
                return 0;
            }
        }
    } else {
        # This section is only for purely symbolic features:
        my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
        @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
        my @value_counts = (0) x @values_for_feature;
        # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
        foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
            my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
            foreach my $i (0..@values_for_feature-1) {
                for my $current_value (@features_and_values) {
                    $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
                }
            }
        }
        foreach my $i (0..@values_for_feature-1) {
            $self->{_probability_cache}->{$values_for_feature[$i]} =
                $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
        }
        if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
            return $self->{_probability_cache}->{$feature_and_value};
        } else {
            return 0;
        }
    }
}
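
##  Note on the numeric branch above: it effectively histograms the feature. The bin
##  width is twice the median gap between consecutive unique values, but never
##  narrower than 1/500th of the value range unless number_of_histogram_bins says
##  otherwise, and the per-bin counts are then normalized into the cached
##  probabilities. For instance (illustrative numbers only), a feature spanning
##  [0, 50] with a median gap of 0.4 gets a bin width of 0.8 and hence
##  int(50/0.8)+1 = 63 sampling points.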

sub probability_of_feature_value_given_class {
    my $self = shift;
    my $feature_name = shift;
    my $feature_value = shift;
    my $class_name = shift;
    $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
    if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
        $feature_value = closest_sampling_point($feature_value,
                             $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
    }
    my $feature_value_class;
    if (defined($feature_value)) {
        $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
    }
    if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
        print "\nNext answer returned by cache for feature $feature_name and " .
            "value $feature_value given class $class_name\n" if $self->{_debug2};
        return $self->{_probability_cache}->{$feature_value_class};
    }
    my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);

    if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
        if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
            $self->{_symbolic_to_numeric_cardinality_threshold}) {
            $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
            $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
            @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
            $diffrange = $valuerange[1] - $valuerange[0];
        }
    }
    my @samples_for_class = ();
# Accumulate all sample names for the given class:
1129
8
3
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
8
34
1130
280
100
320
if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1131
140
116
push @samples_for_class, $sample_name;
1132
}
1133
}
1134
8
50
18
if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1135
0
0
0
if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1136
$self->{_symbolic_to_numeric_cardinality_threshold}) {
1137
my @sampling_points_for_feature =
1138
0
0
@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
0
0
1139
0
0
my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1140
0
0
my @actual_feature_values_for_samples_in_class = ();
1141
0
0
foreach my $sample (@samples_for_class) {
1142
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1143
0
0
my $pattern = '(.+)=(.+)';
1144
0
0
$feature_and_value =~ /$pattern/;
1145
0
0
my ($feature, $value) = ($1, $2);
1146
0
0
0
0
if (($feature eq $feature_name) && ($value ne 'NA')) {
1147
0
0
push @actual_feature_values_for_samples_in_class, $value;
1148
}
1149
}
1150
}
1151
0
0
foreach my $i (0..@sampling_points_for_feature-1) {
1152
0
0
foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1153
0
0
0
if (abs($sampling_points_for_feature[$i] -
1154
$actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1155
0
0
$counts_at_sampling_points[$i]++;
1156
}
1157
}
1158
}
1159
0
0
my $total_counts = 0;
1160
0
0
map {$total_counts += $_} @counts_at_sampling_points;
0
0
1161
0
0
0
die "PFVC1 Something is wrong with your training file. It contains no training " .
1162
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1163
0
0
my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
0
0
1164
0
0
my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
0
0
1165
@sampling_points_for_feature;
1166
0
0
foreach my $i (0..@values_for_feature_and_class-1) {
1167
0
0
$self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1168
}
1169
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1170
0
0
return $self->{_probability_cache}->{$feature_value_class};
1171
} else {
1172
0
0
return 0;
1173
}
1174
} else {
1175
# This section is for numeric features that will be treated symbolically
1176
0
0
my %seen = ();
1177
0
0
0
0
my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1178
0
0
@{$self->{_features_and_values_hash}->{$feature_name}};
0
0
1179
0
0
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
0
0
1180
0
0
my @value_counts = (0) x @values_for_feature;
1181
0
0
foreach my $sample (@samples_for_class) {
1182
0
0
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
0
0
1183
0
0
foreach my $i (0..@values_for_feature-1) {
1184
0
0
foreach my $current_value (@features_and_values) {
1185
0
0
0
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1186
}
1187
}
1188
}
1189
0
0
my $total_counts = 0;
1190
0
0
map {$total_counts += $_} @value_counts;
0
0
1191
0
0
0
die "PFVC2 Something is wrong with your training file. It contains no training " .
1192
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1193
# We normalize by total_count because the probabilities are conditioned on a given class
1194
0
0
foreach my $i (0..@values_for_feature-1) {
1195
0
0
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1196
0
0
$self->{_probability_cache}->{$feature_and_value_and_class} =
1197
$value_counts[$i] / (1.0 * $total_counts);
1198
}
1199
0
0
0
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1200
0
0
return $self->{_probability_cache}->{$feature_value_class};
1201
} else {
1202
0
0
return 0;
1203
}
1204
}
1205
} else {
1206
# This section is for purely symbolic features
1207
8
5
my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
8
36
1208
8
8
my %seen = ();
1209
280
100
66
729
@values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1210
8
6
@{$self->{_features_and_values_hash}->{$feature_name}};
8
11
1211
8
9
@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
28
40
1212
8
12
my @value_counts = (0) x @values_for_feature;
1213
8
8
foreach my $sample (@samples_for_class) {
1214
140
73
my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
140
169
1215
140
123
foreach my $i (0..@values_for_feature-1) {
1216
490
327
foreach my $current_value (@features_and_values) {
1217
1960
100
2381
$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1218
}
1219
}
1220
}
1221
8
9
my $total_counts = 0;
1222
8
6
map {$total_counts += $_} @value_counts;
28
23
1223
8
50
10
die "PFVC2 Something is wrong with your training file. It contains no training " .
1224
"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1225
# We normalize by total_count because the probabilities are conditioned on a given class
1226
8
9
foreach my $i (0..@values_for_feature-1) {
1227
28
29
my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1228
28
55
$self->{_probability_cache}->{$feature_and_value_and_class} =
1229
$value_counts[$i] / (1.0 * $total_counts);
1230
}
1231
8
50
13
if (exists $self->{_probability_cache}->{$feature_value_class}) {
1232
8
32
return $self->{_probability_cache}->{$feature_value_class};
1233
} else {
1234
0
0
return 0;
1235
}
1236
}
1237
}
1238
1239
sub probability_of_feature_less_than_threshold {
1240
0
0
0
0
my $self = shift;
1241
0
0
my $feature_name = shift;
1242
0
0
my $threshold = shift;
1243
0
0
my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1244
return $self->{_probability_cache}->{$feature_threshold_combo}
1245
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1246
0
0
0
my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
0
0
0
0
1247
0
0
0
my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
0
0
1248
0
0
my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1249
0
0
$self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1250
0
0
return $probability;
1251
}
1252
1253
sub probability_of_feature_less_than_threshold_given_class {
1254
0
0
0
0
my $self = shift;
1255
0
0
my $feature_name = shift;
1256
0
0
my $threshold = shift;
1257
0
0
my $class_name = shift;
1258
0
0
my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1259
return $self->{_probability_cache}->{$feature_threshold_class_combo}
1260
0
0
0
if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1261
0
0
my @data_samples_for_class = ();
1262
# Accumulate all sample names for the given class:
1263
0
0
foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
0
0
1264
push @data_samples_for_class, $sample_name
1265
0
0
0
if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1266
}
1267
0
0
my @actual_feature_values_for_samples_in_class = ();
1268
0
0
foreach my $sample (@data_samples_for_class) {
1269
0
0
foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
0
0
1270
0
0
my $pattern = '(.+)=(.+)';
1271
0
0
$feature_and_value =~ /$pattern/;
1272
0
0
my ($feature,$value) = ($1,$2);
1273
0
0
0
0
push @actual_feature_values_for_samples_in_class, $value
1274
if $feature eq $feature_name && $value ne 'NA';
1275
}
1276
}
1277
0
0
0
my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
0
0
1278
# The condition in the assignment that follows was a bug correction in Version 3.20
1279
0
0
0
my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1280
0
0
$self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1281
0
0
return $probability;
1282
}
1283
1284
# This method requires that all truly numeric types only be expressed as '<' or '>'
1285
# constructs in the array of branch features and thresholds
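# A sketch of the expected argument form (the feature names here are hypothetical):
#     my @branch = ('smoking=never', 'age>36', 'g2<2.5');
#     my $p = $dt->probability_of_a_sequence_of_features_and_values_or_thresholds(\@branch);
# Symbolic features appear as 'name=value' items; truly numeric features appear only
# through '<' and '>' threshold items, as required above.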
1286
sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1287
329
329
0
213
my $self = shift;
1288
329
236
my $arr = shift;
1289
329
379
my @array_of_features_and_values_or_thresholds = @$arr;
1290
329
50
422
return if scalar @array_of_features_and_values_or_thresholds == 0;
1291
329
392
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1292
329
100
650
return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1293
96
74
my $probability = undef;
1294
96
64
my $pattern1 = '(.+)=(.+)';
1295
96
55
my $pattern2 = '(.+)<(.+)';
1296
96
68
my $pattern3 = '(.+)>(.+)';
1297
96
64
my @true_numeric_types = ();
1298
96
62
my @true_numeric_types_feature_names = ();
1299
96
66
my @symbolic_types = ();
1300
96
65
my @symbolic_types_feature_names = ();
1301
96
82
foreach my $item (@array_of_features_and_values_or_thresholds) {
1302
282
50
502
if ($item =~ /$pattern2/) {
50
1303
0
0
push @true_numeric_types, $item;
1304
0
0
my ($feature,$value) = ($1,$2);
1305
0
0
push @true_numeric_types_feature_names, $feature;
1306
} elsif ($item =~ /$pattern3/) {
1307
0
0
push @true_numeric_types, $item;
1308
0
0
my ($feature,$value) = ($1,$2);
1309
0
0
push @true_numeric_types_feature_names, $feature;
1310
} else {
1311
282
215
push @symbolic_types, $item;
1312
282
409
$item =~ /$pattern1/;
1313
282
323
my ($feature,$value) = ($1,$2);
1314
282
299
push @symbolic_types_feature_names, $feature;
1315
}
1316
}
1317
96
91
my %seen1 = ();
1318
96
0
89
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1319
96
63
my %seen2 = ();
1320
96
50
85
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
282
622
1321
96
132
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1322
print_array_with_msg("POS: Answer returned by find_bounded: ",
1323
96
50
124
$bounded_intervals_numeric_types) if $self->{_debug2};
1324
# Calculate the upper and the lower bounds to be used when searching for the best
1325
# threshold for each of the numeric features that are in play at the current node:
1326
96
68
my (%upperbound, %lowerbound);
1327
96
87
foreach my $feature_name (@true_numeric_types_feature_names) {
1328
0
0
$upperbound{$feature_name} = undef;
1329
0
0
$lowerbound{$feature_name} = undef;
1330
}
1331
96
88
foreach my $item (@$bounded_intervals_numeric_types) {
1332
0
0
foreach my $feature_grouping (@$item) {
1333
0
0
0
if ($feature_grouping->[1] eq '>') {
1334
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1335
} else {
1336
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337
}
1338
}
1339
}
1340
96
85
foreach my $feature_name (@true_numeric_types_feature_names) {
1341
0
0
0
0
if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
0
0
0
0
0
0
0
1342
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1343
0
0
return 0;
1344
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1345
0
0
0
if (! $probability) {
1346
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1347
$upperbound{$feature_name}) -
1348
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1349
} else {
1350
$probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1351
$upperbound{$feature_name}) -
1352
0
0
$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1353
}
1354
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1355
0
0
0
if (! $probability) {
1356
$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1357
0
0
$upperbound{$feature_name});
1358
} else {
1359
$probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1360
0
0
$upperbound{$feature_name});
1361
}
1362
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1363
0
0
0
if (! $probability) {
1364
$probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1365
0
0
$lowerbound{$feature_name});
1366
} else {
1367
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1368
0
0
$lowerbound{$feature_name}));
1369
}
1370
} else {
1371
0
0
die("Ill formatted call to 'probability_of_sequence' method");
1372
}
1373
}
1374
96
85
foreach my $feature_and_value (@symbolic_types) {
1375
282
50
750
if ($feature_and_value =~ /$pattern1/) {
1376
282
338
my ($feature,$value) = ($1,$2);
1377
282
100
276
if (! $probability) {
1378
96
138
$probability = $self->probability_of_feature_value($feature, $value);
1379
} else {
1380
186
203
$probability *= $self->probability_of_feature_value($feature, $value);
1381
}
1382
}
1383
}
1384
96
176
$self->{_probability_cache}->{$sequence} = $probability;
1385
96
197
return $probability;
1386
}
1387
1388
## The following method requires that all truly numeric types only be expressed as
1389
## '<' or '>' constructs in the array of branch features and thresholds
1390
sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1391
220
220
0
150
my $self = shift;
1392
220
133
my $arr = shift;
1393
220
167
my $class_name = shift;
1394
220
229
my @array_of_features_and_values_or_thresholds = @$arr;
1395
220
50
315
return if scalar @array_of_features_and_values_or_thresholds == 0;
1396
220
245
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1397
220
212
my $sequence_with_class = "$sequence" . "::" . $class_name;
1398
return $self->{_probability_cache}->{$sequence_with_class}
1399
220
100
343
if exists $self->{_probability_cache}->{$sequence_with_class};
1400
200
126
my $probability = undef;
1401
200
147
my $pattern1 = '(.+)=(.+)';
1402
200
103
my $pattern2 = '(.+)<(.+)';
1403
200
128
my $pattern3 = '(.+)>(.+)';
1404
200
134
my @true_numeric_types = ();
1405
200
144
my @true_numeric_types_feature_names = ();
1406
200
128
my @symbolic_types = ();
1407
200
132
my @symbolic_types_feature_names = ();
1408
200
171
foreach my $item (@array_of_features_and_values_or_thresholds) {
1409
572
50
1104
if ($item =~ /$pattern2/) {
50
1410
0
0
push @true_numeric_types, $item;
1411
0
0
my ($feature,$value) = ($1,$2);
1412
0
0
push @true_numeric_types_feature_names, $feature;
1413
} elsif ($item =~ /$pattern3/) {
1414
0
0
push @true_numeric_types, $item;
1415
0
0
my ($feature,$value) = ($1,$2);
1416
0
0
push @true_numeric_types_feature_names, $feature;
1417
} else {
1418
572
416
push @symbolic_types, $item;
1419
572
898
$item =~ /$pattern1/;
1420
572
733
my ($feature,$value) = ($1,$2);
1421
572
578
push @symbolic_types_feature_names, $feature;
1422
}
1423
}
1424
200
198
my %seen1 = ();
1425
200
0
187
@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
0
0
1426
200
138
my %seen2 = ();
1427
200
50
161
@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
572
1307
1428
200
269
my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1429
print_array_with_msg("POSC: Answer returned by find_bounded: ",
1430
200
50
250
$bounded_intervals_numeric_types) if $self->{_debug2};
1431
# Calculate the upper and the lower bounds to be used when searching for the best
1432
# threshold for each of the numeric features that are in play at the current node:
1433
200
135
my (%upperbound, %lowerbound);
1434
200
177
foreach my $feature_name (@true_numeric_types_feature_names) {
1435
0
0
$upperbound{$feature_name} = undef;
1436
0
0
$lowerbound{$feature_name} = undef;
1437
}
1438
200
171
foreach my $item (@$bounded_intervals_numeric_types) {
1439
0
0
foreach my $feature_grouping (@$item) {
1440
0
0
0
if ($feature_grouping->[1] eq '>') {
1441
0
0
$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1442
} else {
1443
0
0
$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444
}
1445
}
1446
}
1447
200
158
foreach my $feature_name (@true_numeric_types_feature_names) {
1448
0
0
0
0
if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
0
0
0
0
0
0
0
1449
$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1450
0
0
return 0;
1451
} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1452
0
0
0
if (! $probability) {
1453
1454
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1455
$upperbound{$feature_name}, $class_name) -
1456
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457
0
0
$lowerbound{$feature_name}, $class_name);
1458
} else {
1459
$probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1460
$upperbound{$feature_name}, $class_name) -
1461
$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462
0
0
$lowerbound{$feature_name}, $class_name))
1463
}
1464
} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1465
0
0
0
if (! $probability) {
1466
$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1467
0
0
$upperbound{$feature_name}, $class_name);
1468
} else {
1469
$probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1470
0
0
$upperbound{$feature_name}, $class_name);
1471
}
1472
} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1473
0
0
0
if (! $probability) {
1474
$probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1475
0
0
$lowerbound{$feature_name}, $class_name);
1476
} else {
1477
$probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1478
0
0
$lowerbound{$feature_name}, $class_name));
1479
}
1480
} else {
1481
0
0
die("Ill formatted call to 'probability of sequence given class' method");
1482
}
1483
}
1484
200
150
foreach my $feature_and_value (@symbolic_types) {
1485
572
50
1496
if ($feature_and_value =~ /$pattern1/) {
1486
572
688
my ($feature,$value) = ($1,$2);
1487
572
100
560
if (! $probability) {
1488
200
287
$probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1489
} else {
1490
372
379
$probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491
}
1492
}
1493
}
1494
200
383
$self->{_probability_cache}->{$sequence_with_class} = $probability;
1495
200
454
return $probability;
1496
}
1497
1498
sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1499
342
342
0
222
my $self = shift;
1500
342
257
my $class_name = shift;
1501
342
187
my $arr = shift;
1502
342
355
my @array_of_features_and_values_or_thresholds = @$arr;
1503
342
344
my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1504
342
327
my $class_and_sequence = "$class_name" . "::" . $sequence;
1505
return $self->{_probability_cache}->{$class_and_sequence}
1506
342
100
822
if exists $self->{_probability_cache}->{$class_and_sequence};
1507
110
119
my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
110
174
1508
110
73
foreach my $i (0..@{$self->{_class_names}}-1) {
110
137
1509
220
173
my $class_name = $self->{_class_names}->[$i];
1510
220
261
my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1511
\@array_of_features_and_values_or_thresholds, $class_name);
1512
220
100
307
if ($prob < 0.000001) {
1513
1
1
$array_of_class_probabilities[$i] = 0.0;
1514
1
5
next;
1515
}
1516
219
278
my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1517
\@array_of_features_and_values_or_thresholds);
1518
# die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1519
# "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1520
# if ! $prob_of_feature_sequence;
1521
219
272
my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1522
219
50
227
if ($prob_of_feature_sequence) {
1523
219
302
$array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1524
} else {
1525
0
0
$array_of_class_probabilities[$i] = $prior;
1526
}
1527
}
1528
110
78
my $sum_probability;
1529
110
98
map {$sum_probability += $_} @array_of_class_probabilities;
220
229
1530
110
50
122
if ($sum_probability == 0) {
1531
0
0
@array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
0
0
1532
0
0
(0..@{$self->{_class_names}}-1);
0
0
1533
} else {
1534
110
88
@array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
220
240
1535
}
1536
110
97
foreach my $i (0..@{$self->{_class_names}}-1) {
110
197
1537
220
266
my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1538
220
464
$self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1539
}
1540
110
192
return $self->{_probability_cache}->{$class_and_sequence};
1541
}
1542
1543
####################################### Class Based Utilities ##########################################
1544
1545
## Given a list of branch attributes for the numeric features of the form, say,
1546
## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1547
## smallest list that is relevant for the purpose of calculating the probabilities.
1548
## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1549
## time, less than 2, AND, at the same time, less than 3, is the same as the
1550
## probability that the feature is less than 1. Similarly, the probability that `age'
1551
## is greater than 34 and also greater than 37 is the same as `age' being greater
1552
## than 37.
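## For the example above, the returned structure reduces (roughly) to
##     [ [['g2', '<', '1']], [['age', '>', '37']] ]
## i.e., only the tightest '<' bound and the tightest '>' bound survive for each
## numeric feature.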
1553
sub find_bounded_intervals_for_numeric_features {
1554
325
325
0
243
my $self = shift;
1555
325
194
my $arr = shift;
1556
325
277
my @arr = @$arr;
1557
325
180
my @features = @{$self->{_feature_names}};
325
454
1558
325
247
my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
0
0
0
0
1559
325
50
436
print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1560
325
201
my @arr3 = ();
1561
325
287
foreach my $feature_name (@features) {
1562
1300
812
my @temp = ();
1563
1300
956
foreach my $x (@arr1) {
1564
0
0
0
0
push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1565
}
1566
1300
50
1657
push @arr3, \@temp if @temp > 0;
1567
}
1568
325
50
403
print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1569
# Sort each list so that '<' entries occur before '>' entries:
1570
325
190
my @arr4;
1571
325
237
foreach my $li (@arr3) {
1572
0
0
my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
0
0
1573
0
0
push @arr4, \@sorted;
1574
}
1575
325
50
388
print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1576
325
179
my @arr5;
1577
325
274
foreach my $li (@arr4) {
1578
0
0
my @temp1 = ();
1579
0
0
my @temp2 = ();
1580
0
0
foreach my $inner (@$li) {
1581
0
0
0
if ($inner->[1] eq '<') {
1582
0
0
push @temp1, $inner;
1583
} else {
1584
0
0
push @temp2, $inner;
1585
}
1586
}
1587
0
0
0
0
if (@temp1 > 0 && @temp2 > 0) {
0
1588
0
0
push @arr5, [\@temp1, \@temp2];
1589
} elsif (@temp1 > 0) {
1590
0
0
push @arr5, [\@temp1];
1591
} else {
1592
0
0
push @arr5, [\@temp2];
1593
}
1594
}
1595
325
50
366
print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1596
325
226
my @arr6 = ();
1597
325
224
foreach my $li (@arr5) {
1598
0
0
my @temp1 = ();
1599
0
0
foreach my $inner (@$li) {
1600
0
0
my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
0
0
1601
0
0
push @temp1, \@sorted;
1602
}
1603
0
0
push @arr6, \@temp1;
1604
}
1605
325
50
384
print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1606
325
214
my @arr9 = ();
1607
325
276
foreach my $li (@arr6) {
1608
0
0
foreach my $alist (@$li) {
1609
0
0
my @newalist = ();
1610
0
0
0
if ($alist->[0][1] eq '<') {
1611
0
0
push @newalist, $alist->[0];
1612
} else {
1613
0
0
push @newalist, $alist->[-1];
1614
}
1615
0
0
0
if ($alist->[0][1] ne $alist->[-1][1]) {
1616
0
0
push @newalist, $alist->[-1];
1617
}
1618
0
0
push @arr9, \@newalist;
1619
}
1620
}
1621
325
50
366
print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1622
325
405
return \@arr9;
1623
1624
}
1625
1626
## This method is used to verify that you used legal feature names in the test
1627
## sample that you want to classify with the decision tree.
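## For example (with hypothetical feature names 'age' and 'g2' in the training data):
##     $dt->check_names_used(['age=45', 'g2=2.7']);    # returns 1
##     $dt->check_names_used(['agee=45', 'g2=2.7']);   # returns 0 (unknown feature name)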
1628
sub check_names_used {
1629
0
0
0
0
my $self = shift;
1630
0
0
my $features_and_values_test_data = shift;
1631
0
0
my @features_and_values_test_data = @$features_and_values_test_data;
1632
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
1633
0
0
foreach my $feature_and_value (@features_and_values_test_data) {
1634
0
0
$feature_and_value =~ /$pattern/;
1635
0
0
my ($feature,$value) = ($1,$2);
1636
0
0
0
0
die "Your test data has formatting error" unless defined($feature) && defined($value);
1637
0
0
0
return 0 unless contained_in($feature, @{$self->{_feature_names}});
0
0
1638
}
1639
0
0
return 1;
1640
}
1641
1642
####################################### Data Condition Calculator ######################################
1643
1644
## This method estimates the worst-case fan-out of the decision tree taking into
1645
## account the number of values (and therefore the number of branches emanating from
1646
## a node) for the symbolic features.
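## As a rough illustration: with 4 symbolic features whose largest value set has 5
## values, the worst-case estimate computed below is 5 ** 4 = 625 nodes.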
1647
sub determine_data_condition {
1648
0
0
0
0
my $self = shift;
1649
0
0
my $num_of_features = scalar @{$self->{_feature_names}};
0
0
1650
0
0
my @values = ();
1651
0
0
my @number_of_values;
1652
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
1653
0
0
push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1654
0
0
0
if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
0
0
1655
0
0
push @number_of_values, scalar @values;
1656
}
1657
0
0
0
return if ! @values;
1658
0
0
print "Number of features: $num_of_features\n";
1659
0
0
my @minmax = minmax(\@number_of_values);
1660
0
0
my $max_num_values = $minmax[1];
1661
0
0
print "Largest number of values for symbolic features is: $max_num_values\n";
1662
0
0
my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1663
0
0
print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1664
"nodes. The exact number of nodes created depends critically on " .
1665
"the entropy_threshold used for node expansion (the default value " .
1666
"for this threshold is 0.01) and on the value set for max_depth_desired " .
1667
"for the depth of the tree.\n";
1668
0
0
0
if ($estimated_number_of_nodes > 10000) {
1669
0
0
print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1670
"large value for entropy_threshold and/or a small value for " .
1671
"for max_depth_desired to reduce the number of nodes created.\n";
1672
0
0
print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1673
0
0
my $answer = <STDIN>;
1674
0
0
$answer =~ s/\r?\n?$//;
1675
0
0
0
while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1676
0
0
print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1677
0
0
$answer = <STDIN>;
1678
0
0
$answer =~ s/\r?\n?$//;
1679
}
1680
0
0
0
die unless $answer =~ /y(es)?/i;
1681
}
1682
}
1683
1684
1685
#################################### Read Training Data From File ######################################
1686
1687
1688
sub get_training_data {
1689
1
1
1
6
my $self = shift;
1690
1
50
8
die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1691
1
1
my %class_names = ();
1692
1
3
my %all_record_ids_with_class_labels;
1693
my $firstline;
1694
0
0
my %data_hash;
1695
1
3
$|++;
1696
1
50
22
open FILEIN, $self->{_training_datafile} or die "unable to open $self->{_training_datafile}: $!";
1697
1
1
my $record_index = 0;
1698
1
1
my $firsetline;
1699
1
23
while (<FILEIN>) {
1700
36
50
65
next if /^[ ]*\r?\n?$/;
1701
36
75
$_ =~ s/\r?\n?$//;
1702
36
38
my $record = cleanup_csv($_);
1703
36
100
49
if ($record_index == 0) {
1704
1
1
$firstline = $record;
1705
1
1
$record_index++;
1706
1
3
next;
1707
}
1708
35
112
my @parts = split /,/, $record;
1709
35
35
my $classname = $parts[$self->{_csv_class_column_index}];
1710
35
25
$class_names{$classname} = 1;
1711
35
23
my $record_label = shift @parts;
1712
35
45
$record_label =~ s/^\s*\"|\"\s*$//g;
1713
35
40
$data_hash{$record_label} = \@parts;
1714
35
32
$all_record_ids_with_class_labels{$record_label} = $classname;
1715
35
50
43
print "." if $record_index % 10000 == 0;
1716
35
70
$record_index++;
1717
}
1718
1
5
close FILEIN;
1719
1
2
$|--;
1720
1
3
$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1721
1
50
3
print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1722
1
4
my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1723
1
2
my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1724
1
3
my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
2
6
1725
1
2
my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
4
5
1
2
1726
1
5
my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
35
57
1727
1
7
my @sample_names = map {"sample_$_"} keys %data_hash;
35
29
1728
1
50
5
my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
35
26
35
15
140
107
140
96
140
288
35
34
1729
1
50
5
my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
4
4
4
3
4
11
140
81
140
193
140
108
1
2
1730
1
2
my %numeric_features_valuerange_hash = ();
1731
1
1
my %feature_values_how_many_uniques_hash = ();
1732
1
1
my %features_and_unique_values_hash = ();
1733
1
2
my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1734
1
3
foreach my $feature (keys %features_and_values_hash) {
1735
4
3
my %seen1 = ();
1736
140
100
66
398
my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1737
4
4
@{$features_and_values_hash{$feature}};
4
4
1738
4
6
$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1739
4
3
my $not_all_values_float = 0;
1740
4
50
5
map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
14
89
1741
4
50
6
if ($not_all_values_float == 0) {
1742
0
0
my @minmaxvalues = minmax(\@unique_values_for_feature);
1743
0
0
$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1744
}
1745
4
10
$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1746
}
1747
1
50
7
if ($self->{_debug1}) {
1748
0
0
print "\nAll class names: @all_class_names\n";
1749
0
0
print "\nEach sample data record:\n";
1750
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
0
0
1751
0
0
print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
0
0
1752
}
1753
0
0
print "\nclass label for each data sample:\n";
1754
0
0
foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
0
0
1755
0
0
print "$sample => $class_for_sample_hash{$sample}\n";
1756
}
1757
0
0
print "\nFeatures used: @feature_names\n\n";
1758
0
0
print "\nfeatures and the values taken by them:\n";
1759
0
0
foreach my $feature (sort keys %features_and_values_hash) {
1760
0
0
print "$feature => @{$features_and_values_hash{$feature}}\n";
0
0
1761
}
1762
0
0
print "\nnumeric features and their ranges:\n";
1763
0
0
foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1764
0
0
print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
0
0
1765
}
1766
0
0
print "\nnumber of unique values in each feature:\n";
1767
0
0
foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1768
0
0
print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1769
}
1770
}
1771
1
2
$self->{_class_names} = \@all_class_names;
1772
1
2
$self->{_feature_names} = \@feature_names;
1773
1
1
$self->{_samples_class_label_hash} = \%class_for_sample_hash;
1774
1
6
$self->{_training_data_hash} = \%feature_values_for_samples_hash;
1775
1
2
$self->{_features_and_values_hash} = \%features_and_values_hash;
1776
1
1
$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1777
1
2
$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1778
1
19
$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1779
}
1780
1781
sub show_training_data {
1782
0
0
1
0
my $self = shift;
1783
0
0
my @class_names = @{$self->{_class_names}};
0
0
1784
0
0
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
0
0
1785
0
0
my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
0
0
1786
0
0
my %training_data_hash = %{$self->{_training_data_hash}};
0
0
1787
0
0
print "\n\nClass Names: @class_names\n";
1788
0
0
print "\n\nFeatures and Their Values:\n\n";
1789
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
1790
0
0
print "$k ---> @{$features_and_values_hash{$k}}\n";
0
0
1791
}
1792
0
0
print "\n\nSamples vs. Class Labels:\n\n";
1793
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
0
0
1794
0
0
print "$kee => $samples_class_label_hash{$kee}\n";
1795
}
1796
0
0
print "\n\nTraining Samples:\n\n";
1797
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
0
0
1798
keys %training_data_hash) {
1799
0
0
print "$kee => @{$training_data_hash{$kee}}\n";
0
0
1800
}
1801
}
1802
1803
sub get_class_names {
1804
0
0
0
0
my $self = shift;
1805
0
0
return @{$self->{_class_names}}
0
0
1806
}
1807
1808
########################################## Utility Routines ############################################
1809
1810
sub closest_sampling_point {
1811
0
0
0
0
my $value = shift;
1812
0
0
my $arr_ref = shift;
1813
0
0
my @arr = @{$arr_ref};
0
0
1814
0
0
my @compare = map {abs($_ - $value)} @arr;
0
0
1815
0
0
my ($minval,$index) = minimum(\@compare);
1816
0
0
return $arr[$index];
1817
}
1818
1819
## Returns the array index that contains a specified STRING value (meant only for an array of strings):
1820
sub get_index_at_value {
1821
0
0
0
0
my $value = shift;
1822
0
0
my @array = @{shift @_};
0
0
1823
0
0
foreach my $i (0..@array-1) {
1824
0
0
0
return $i if $value eq $array[$i];
1825
}
1826
}
1827
1828
## When the training data is read from a CSV file, we assume that the first column
1829
## of each data record contains a unique integer identifier for the record in that
1830
## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1831
## followed by the identifying integers. The purpose of this function is to return
1832
## the identifying integer associated with a data record.
1833
sub sample_index {
1834
1128
1128
0
629
my $arg = shift;
1835
1128
1115
$arg =~ /_(.+)$/;
1836
1128
1253
return $1;
1837
}
1838
1839
# Returns the minimum value and its positional index in an array
1840
sub minimum {
1841
0
0
0
0
my $arr = shift;
1842
0
0
my $min;
1843
my $index;
1844
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1845
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1846
0
0
$index = $i;
1847
0
0
$min = $arr->[$i];
1848
}
1849
}
1850
0
0
return ($min, $index);
1851
}
1852
1853
# Returns an array of two values, the min and the max, of an array of floats
1854
sub minmax {
1855
0
0
0
0
my $arr = shift;
1856
0
0
my ($min, $max);
1857
0
0
foreach my $i (0..@{$arr}-1) {
0
0
1858
0
0
0
0
if ( (!defined $min) || ($arr->[$i] < $min) ) {
1859
0
0
$min = $arr->[$i];
1860
}
1861
0
0
0
0
if ( (!defined $max) || ($arr->[$i] > $max) ) {
1862
0
0
$max = $arr->[$i];
1863
}
1864
}
1865
0
0
return ($min, $max);
1866
}
1867
1868
# checks whether an element is in an array:
1869
sub contained_in {
1870
267
267
0
176
my $ele = shift;
1871
267
269
my @array = @_;
1872
267
160
my $count = 0;
1873
267
100
181
map {$count++ if $ele eq $_} @array;
648
1035
1874
267
427
return $count;
1875
}
1876
1877
# Meant only for an array of strings (no nesting):
1878
sub deep_copy_array {
1879
196
196
0
152
my $ref_in = shift;
1880
196
116
my $ref_out;
1881
196
100
254
return [] if scalar @$ref_in == 0;
1882
181
133
foreach my $i (0..@{$ref_in}-1) {
181
232
1883
393
423
$ref_out->[$i] = $ref_in->[$i];
1884
}
1885
181
283
return $ref_out;
1886
}
1887
1888
sub check_for_illegal_params2 {
1889
1
1
0
2
my @params = @_;
1890
1
5
my @legal_params = qw / training_datafile
1891
entropy_threshold
1892
max_depth_desired
1893
csv_class_column_index
1894
csv_columns_for_features
1895
symbolic_to_numeric_cardinality_threshold
1896
number_of_histogram_bins
1897
debug1
1898
debug2
1899
debug3
1900
/;
1901
1
1
my $found_match_flag;
1902
1
2
foreach my $param (@params) {
1903
3
3
foreach my $legal (@legal_params) {
1904
10
9
$found_match_flag = 0;
1905
10
100
14
if ($param eq $legal) {
1906
3
1
$found_match_flag = 1;
1907
3
4
last;
1908
}
1909
}
1910
3
50
4
last if $found_match_flag == 0;
1911
}
1912
1
4
return $found_match_flag;
1913
}
1914
1915
sub print_array_with_msg {
1916
0
0
0
0
my $message = shift;
1917
0
0
my $arr = shift;
1918
0
0
print "\n$message: ";
1919
0
0
print_nested_array( $arr );
1920
}
1921
1922
sub print_nested_array {
1923
0
0
0
0
my $arr = shift;
1924
0
0
my @arr = @$arr;
1925
0
0
print "[";
1926
0
0
foreach my $item (@arr) {
1927
0
0
0
if (ref $item) {
1928
0
0
print_nested_array($item);
1929
} else {
1930
0
0
print "$item";
1931
}
1932
}
1933
0
0
print "]";
1934
}
1935
1936
## Introduced in Version 3.21, I wrote this function in response to a need to
1937
## create a decision tree for a very large national econometric database. The
1938
## fields in the CSV file for this database are allowed to be double quoted and such
1939
## fields may contain commas inside them. This function also replaces empty fields
1940
## with the generic string 'NA' as a shorthand for "Not Available". IMPORTANT: This
1941
## function skips over the first field in each record. It is assumed that the first
1942
## field in the first record that defines the feature names is the empty string ("")
1943
## and the same field in all other records is an ID number for the record.
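## For example, a hypothetical record such as
##     5,"Ann Arbor, MI",,3.7
## would come out roughly as
##     5,Ann_Arbor_MI,NA,3.7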
1944
sub cleanup_csv {
1945
36
36
0
28
my $line = shift;
1946
36
33
$line =~ tr/()[]{}'/ /;
1947
36
51
my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1948
36
35
for (@double_quoted) {
1949
0
0
my $item = $_;
1950
0
0
$item = substr($item, 1, -1);
1951
0
0
$item =~ s/^\s+|,|\s+$//g;
1952
0
0
$item = join '_', split /\s+/, $item;
1953
0
0
substr($line, index($line, $_), length($_)) = $item;
1954
}
1955
36
124
my @white_spaced = $line =~ /,\s*[^,]+\s+[^,]+\s*,/g;
1956
36
30
for (@white_spaced) {
1957
0
0
my $item = $_;
1958
0
0
$item = substr($item, 0, -1);
1959
0
0
0
$item = join '_', split /\s+/, $item unless $item =~ /,\s+$/;
1960
0
0
substr($line, index($line, $_), length($_)) = "$item,";
1961
}
1962
36
52
$line =~ s/,\s*(?=,)/,NA/g;
1963
36
38
return $line;
1964
}
1965
1966
######################################### Class EvalTrainingData ########################################
1967
1968
## This subclass of the DecisionTree class is used to evaluate the quality of your
1969
## training data by running a 10-fold cross-validation test on it. This test divides
1970
## all of the training data into ten parts, with nine parts used for training a
1971
## decision tree and one part used for testing its ability to classify correctly.
1972
## This selection of nine parts for training and one part for testing is carried out
1973
## in all of the ten different possible ways. This testing functionality can also
1974
## be used to find the best values to use for the constructor parameters
1975
## entropy_threshold, max_depth_desired, and
1976
## symbolic_to_numeric_cardinality_threshold.
1977
1978
## Only the CSV training files can be evaluated in this manner (because only CSV
1979
## training files are allowed to have numeric features --- which is the more interesting
1980
## case for evaluation analytics).
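## A sketch of typical usage (the file name and column choices are hypothetical):
##     my $eval_data = EvalTrainingData->new(
##                         training_datafile        => 'stage3cancer.csv',
##                         csv_class_column_index   => 2,
##                         csv_columns_for_features => [3,4,5,6,7,8],
##                         entropy_threshold        => 0.01,
##                         max_depth_desired        => 8,
##                     );
##     $eval_data->get_training_data();
##     $eval_data->evaluate_training_data();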
1981
1982
package EvalTrainingData;
1983
1984
@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1985
1986
sub new {
1987
0
0
0
my $class = shift;
1988
0
0
my $instance = Algorithm::DecisionTree->new(@_);
1989
0
0
bless $instance, $class;
1990
}
1991
1992
sub evaluate_training_data {
1993
0
0
0
my $self = shift;
1994
0
0
my $evaldebug = 0;
1995
die "The data evaluation function in the module can only be used when your " .
1996
0
0
0
"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1997
0
0
print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1998
"class-discriminatory power:\n";
1999
0
0
my %all_training_data = %{$self->{_training_data_hash}};
0
0
2000
0
0
my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
2001
Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
2002
0
0
my $fold_size = int(0.1 * (scalar keys %all_training_data));
2003
0
0
print "fold size: $fold_size\n";
2004
0
0
my %confusion_matrix = ();
2005
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2006
0
0
foreach my $inner_class_name (@{$self->{_class_names}}) {
0
0
2007
0
0
$confusion_matrix{$class_name}->{$inner_class_name} = 0;
2008
}
2009
}
2010
0
0
foreach my $fold_index (0..9) {
2011
0
0
print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2012
0
0
my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2013
0
0
my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2014
@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2015
0
0
my %testing_data = ();
2016
0
0
foreach my $x (@testing_samples) {
2017
0
0
$testing_data{$x} = $all_training_data{$x};
2018
}
2019
0
0
my %training_data = ();
2020
0
0
foreach my $x (@training_samples) {
2021
0
0
$training_data{$x} = $all_training_data{$x};
2022
}
2023
0
0
my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2024
0
0
$trainingDT->{_training_data_hash} = \%training_data;
2025
0
0
$trainingDT->{_class_names} = $self->{_class_names};
2026
0
0
$trainingDT->{_feature_names} = $self->{_feature_names};
2027
0
0
$trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2028
0
0
$trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2029
$trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2030
0
0
$self->{_symbolic_to_numeric_cardinality_threshold};
2031
0
0
foreach my $sample_name (@training_samples) {
2032
$trainingDT->{_samples_class_label_hash}->{$sample_name} =
2033
0
0
$self->{_samples_class_label_hash}->{$sample_name};
2034
}
2035
0
0
foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
0
0
2036
0
0
$trainingDT->{_features_and_values_hash}->{$feature} = ();
2037
}
2038
0
0
my $pattern = '(\S+)\s*=\s*(\S+)';
2039
0
0
foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
0
0
2040
Algorithm::DecisionTree::sample_index($b)}
2041
0
0
keys %{$trainingDT->{_training_data_hash}}) {
2042
0
0
foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
0
0
2043
0
0
$feature_and_value =~ /$pattern/;
2044
0
0
my ($feature,$value) = ($1,$2);
2045
0
0
0
push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
0
0
2046
}
2047
}
2048
0
0
foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2049
0
0
my %seen = ();
2050
0
0
0
0
my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2051
0
0
@{$trainingDT->{_features_and_values_hash}->{$feature}};
0
0
2052
0
0
0
if (Algorithm::DecisionTree::contained_in($feature,
2053
0
0
keys %{$self->{_numeric_features_valuerange_hash}})) {
2054
0
0
@unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
0
0
2055
} else {
2056
0
0
@unique_values_for_feature = sort @unique_values_for_feature;
2057
}
2058
0
0
$trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2059
}
2060
0
0
foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
0
0
2061
my @minmaxvalues = Algorithm::DecisionTree::minmax(
2062
0
0
\@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
0
0
2063
0
0
$trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2064
}
2065
0
0
0
if ($evaldebug) {
2066
0
0
print "\n\nprinting samples in the testing set: @testing_samples\n";
2067
0
0
print "\n\nPrinting features and their values in the training set:\n";
2068
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
0
0
2069
0
0
print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
0
0
2070
}
2071
0
0
print "\n\nPrinting unique values for features:\n";
2072
0
0
foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
0
0
2073
0
0
print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
0
0
2074
}
2075
0
0
print "\n\nPrinting unique value ranges for features:\n";
2076
0
0
foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
0
0
2077
0
0
print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
0
0
2078
}
2079
}
2080
0
0
foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
0
0
2081
$trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2082
0
0
scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
0
0
2083
}
2084
0
0
0
$trainingDT->{_debug2} = 1 if $evaldebug;
2085
0
0
$trainingDT->calculate_first_order_probabilities();
2086
0
0
$trainingDT->calculate_class_priors();
2087
0
0
my $root_node = $trainingDT->construct_decision_tree_classifier();
2088
0
0
0
$root_node->display_decision_tree(" ") if $evaldebug;
2089
0
0
foreach my $test_sample_name (@testing_samples) {
2090
0
0
my @test_sample_data = @{$all_training_data{$test_sample_name}};
0
0
2091
0
0
0
print "original data in test sample: @test_sample_data\n" if $evaldebug;
2092
0
0
0
0
@test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
0
0
2093
0
0
0
print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2094
0
0
my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
0
0
2095
0
0
my @solution_path = @{$classification{'solution_path'}};
0
0
2096
0
0
delete $classification{'solution_path'};
2097
0
0
my @which_classes = keys %classification;
2098
0
0
@which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
0
0
2099
0
0
my $most_likely_class_label = $which_classes[0];
2100
0
0
0
if ($evaldebug) {
2101
0
0
print "\nClassification:\n\n";
2102
0
0
print " class probability\n";
2103
0
0
print " ---------- -----------\n";
2104
0
0
foreach my $which_class (@which_classes) {
2105
0
0
my $classstring = sprintf("%-30s", $which_class);
2106
0
0
my $valuestring = sprintf("%-30s", $classification{$which_class});
2107
0
0
print " $classstring $valuestring\n";
2108
}
2109
0
0
print "\nSolution path in the decision tree: @solution_path\n";
2110
0
0
print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2111
}
2112
0
0
my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2113
0
0
0
print "$test_sample_name: true_class: $true_class_label_for_sample " .
2114
"estimated_class: $most_likely_class_label\n" if $evaldebug;
2115
0
0
$confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2116
}
2117
}
2118
0
0
print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2119
0
0
my $matrix_header = " " x 30;
2120
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2121
0
0
$matrix_header .= sprintf("%-30s", $class_name);
2122
}
2123
0
0
print "\n" . $matrix_header . "\n\n";
2124
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2125
0
0
my $row_display = sprintf("%-30s", $row_class_name);
2126
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2127
0
0
$row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2128
}
2129
0
0
print "$row_display\n\n";
2130
}
2131
0
0
print "\n\n";
2132
0
0
my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2133
0
0
foreach my $row_class_name (sort keys %confusion_matrix) {
2134
0
0
foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
0
0
2135
0
0
0
if ($row_class_name eq $col_class_name) {
2136
0
0
$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2137
} else {
2138
0
0
$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2139
}
2140
}
2141
}
2142
0
0
my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2143
0
0
print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2144
0
0
0
0
if ($data_quality_index <= 80) {
0
0
0
0
0
2145
0
0
print "\nYour training data does not possess much class discriminatory " .
2146
"information. It could be that the classes are inherently not well " .
2147
"separable or that your constructor parameter choices are not appropriate.\n";
2148
} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2149
0
0
print "\nYour training data possesses some class discriminatory information " .
2150
"but it may not be sufficient for real-world applications. You might " .
2151
"try tweaking the constructor parameters to see if that improves the " .
2152
"class discriminations.\n";
2153
} elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2154
0
0
print "\nYour training data appears to possess good class discriminatory " .
2155
"information. Whether or not it is acceptable would depend on your " .
2156
"application.\n";
2157
} elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2158
0
0
print "\nYour training data is of excellent quality.\n";
2159
} else {
2160
0
0
print "\nYour training data is perfect.\n";
2161
}
2162
2163
}
2164
2165
2166
############################################# Class DTNode #############################################
2167
2168
# The nodes of the decision tree are instances of this class:
2169
2170
package DTNode;
2171
2172
1
1
11
use strict;
1
1
1
19
2173
1
1
3
use Carp;
1
1
1
806
2174
2175
# $feature is the feature test at the current node. $branch_features_and_values is
2176
# an anonymous array holding the feature names and corresponding values on the path
2177
# from the root to the current node:
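# For instance (a hypothetical path), $branch_features_and_values_or_thresholds could
# hold ['grade=2.0', 'gleason=4.0'] for a node two levels below the root.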
2178
sub new {
2179
30
30
35
my ($class, $feature, $entropy, $class_probabilities,
2180
$branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2181
30
100
42
$root_or_not = '' if !defined $root_or_not;
2182
30
100
37
if ($root_or_not eq 'root') {
2183
1
2
$dt->{nodes_created} = -1;
2184
1
2
$dt->{class_names} = undef;
2185
}
2186
30
96
my $self = {
2187
_dt => $dt,
2188
_feature => $feature,
2189
_node_creation_entropy => $entropy,
2190
_class_probabilities => $class_probabilities,
2191
_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2192
_linked_to => [],
2193
};
2194
30
33
bless $self, $class;
2195
30
42
$self->{_serial_number} = $self->get_next_serial_num();
2196
30
30
return $self;
2197
}
2198
2199
sub how_many_nodes {
2200
0
0
0
my $self = shift;
2201
0
0
return $self->{_dt}->{nodes_created} + 1;
2202
}
2203
2204
sub set_class_names {
2205
1
1
1
my $self = shift;
2206
1
1
my $class_names_list = shift;
2207
1
2
$self->{_dt}->{class_names} = $class_names_list;
2208
}
2209
2210
sub get_class_names {
2211
0
0
0
my $self = shift;
2212
0
0
return $self->{_dt}->{class_names};
2213
}
2214
2215
sub get_next_serial_num {
2216
30
30
16
my $self = shift;
2217
30
33
$self->{_dt}->{nodes_created} += 1;
2218
30
42
return $self->{_dt}->{nodes_created};
2219
}
2220
2221
sub get_serial_num {
2222
30
30
27
my $self = shift;
2223
30
29
$self->{_serial_number};
2224
}
2225
2226
# this returns the feature test at the current node
2227
sub get_feature {
2228
0
0
0
my $self = shift;
2229
0
0
return $self->{ _feature };
2230
}
2231
2232
sub set_feature {
2233
29
29
25
my $self = shift;
2234
29
19
my $feature = shift;
2235
29
36
$self->{_feature} = $feature;
2236
}
2237
2238
sub get_node_entropy {
2239
30
30
21
my $self = shift;
2240
30
29
return $self->{_node_creation_entropy};
2241
}
2242
2243
sub get_class_probabilities {
2244
0
0
0
my $self = shift;
2245
0
0
return $self->{ _class_probabilities};
2246
}
2247
2248
sub get_branch_features_and_values_or_thresholds {
2249
30
30
19
my $self = shift;
2250
30
39
return $self->{_branch_features_and_values_or_thresholds};
2251
}
2252
2253
sub add_to_branch_features_and_values {
2254
0
0
0
my $self = shift;
2255
0
0
my $feature_and_value = shift;
2256
0
0
push @{$self->{ _branch_features_and_values }}, $feature_and_value;
0
0
2257
}
2258
2259
sub get_children {
2260
0
0
0
my $self = shift;
2261
0
0
return $self->{_linked_to};
2262
}
2263
2264
sub add_child_link {
2265
29
29
20
my ($self, $new_node, ) = @_;
2266
29
17
push @{$self->{_linked_to}}, $new_node;
29
40
2267
}
2268
2269
sub delete_all_links {
2270
0
0
0
my $self = shift;
2271
0
0
$self->{_linked_to} = undef;
2272
}
2273
2274
sub display_node {
2275
0
0
0
my $self = shift;
2276
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2277
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2278
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2279
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2280
0
0
my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2281
0
0
my $serial_num = $self->get_serial_num();
2282
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2283
0
0
print "\n\nNODE $serial_num" .
2284
":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2285
"\n Class probabilities at current node: @class_probabilities_for_display" .
2286
"\n Entropy at current node: $print_node_creation_entropy_at_node" .
2287
"\n Best feature test at current node: $feature_at_node\n\n";
2288
}
2289
2290
sub display_decision_tree {
2291
0
0
0
my $self = shift;
2292
0
0
my $offset = shift;
2293
0
0
my $serial_num = $self->get_serial_num();
2294
0
0
0
if (@{$self->get_children()} > 0) {
0
0
2295
0
0
0
my $feature_at_node = $self->get_feature() || " ";
2296
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2297
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2298
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2299
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2300
0
0
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2301
0
0
my @class_names = @{$self->get_class_names()};
0
0
2302
my @print_class_probabilities_with_class =
2303
0
0
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
0
0
2304
0
0
print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2305
0
0
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2306
0
0
print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2307
"$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2308
0
0
$offset .= " ";
2309
0
0
foreach my $child (@{$self->get_children()}) {
0
0
2310
0
0
$child->display_decision_tree($offset);
2311
}
2312
} else {
2313
0
0
my $node_creation_entropy_at_node = $self->get_node_entropy();
2314
0
0
my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2315
0
0
my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
0
0
2316
0
0
my @class_probabilities = @{$self->get_class_probabilities()};
0
0
2317
0
0
my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
0
0
2318
0
0
my @class_names = @{$self->get_class_names()};
0
0
2319
my @print_class_probabilities_with_class =
2320
0
0
map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
0
0
2321
0
0
print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2322
0
0
my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2323
0
0
print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2324
"Class Probs: @print_class_probabilities_with_class\n\n";
2325
}
2326
}
2327
2328
2329
############################## Generate Your Own Numeric Training Data #################################
2330
############################# Class TrainingDataGeneratorNumeric ################################
2331
2332
## See the script generate_training_data_numeric.pl in the examples
2333
## directory on how to use this class for generating your own numeric training and
2334
## test data. The training and test data are generated in accordance with the
2335
## specifications you place in the parameter file that is supplied as an argument to
2336
## the constructor of this class.
2337
2338
package TrainingDataGeneratorNumeric;
2339
2340
1
1
4
use strict;
1
2
1
15
2341
1
1
3
use Carp;
1
1
1
744
2342
2343
sub new {
2344
0
0
0
my ($class, %args) = @_;
2345
0
0
my @params = keys %args;
2346
0
0
0
croak "\nYou have used a wrong name for a keyword argument " .
2347
"--- perhaps a misspelling\n"
2348
if check_for_illegal_params3(@params) == 0;
2349
bless {
2350
_output_training_csv_file => $args{'output_training_csv_file'}
2351
|| croak("name for output_training_csv_file required"),
2352
_output_test_csv_file => $args{'output_test_csv_file'}
2353
|| croak("name for output_test_csv_file required"),
2354
_parameter_file => $args{'parameter_file'}
2355
|| croak("parameter_file required"),
2356
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2357
|| croak("number_of_samples_for_training"),
2358
_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2359
|| croak("number_of_samples_for_testing"),
2360
0
0
0
_debug => $args{debug} || 0,
0
0
0
0
0
2361
_class_names => [],
2362
_class_names_and_priors => {},
2363
_features_with_value_range => {},
2364
_features_ordered => [],
2365
_classes_and_their_param_values => {},
2366
}, $class;
2367
}
2368
2369
sub check_for_illegal_params3 {
2370
0
0
0
my @params = @_;
2371
0
0
my @legal_params = qw / output_training_csv_file
2372
output_test_csv_file
2373
parameter_file
2374
number_of_samples_for_training
2375
number_of_samples_for_testing
2376
debug
2377
/;
2378
0
0
my $found_match_flag;
2379
0
0
foreach my $param (@params) {
2380
0
0
foreach my $legal (@legal_params) {
2381
0
0
$found_match_flag = 0;
2382
0
0
0
if ($param eq $legal) {
2383
0
0
$found_match_flag = 1;
2384
0
0
last;
2385
}
2386
}
2387
0
0
0
last if $found_match_flag == 0;
2388
}
2389
0
0
return $found_match_flag;
2390
}
2391
2392
## The training data generated by an instance of the class
2393
## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file that
2394
## you supply to the class constructor through a constructor variable called
2395
## `parameter_file'. This method is for parsing the parameter file in order to
2396
## determine the names to be used for the different data classes, their
2397
## means, and their variances.
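##
## The parser below expects the parameter file to contain entries that match
## its regexes; an illustrative (hypothetical) layout consistent with that
## parsing, for two features and two classes, would look like:
##
##     class names: class1 class2
##     class priors: 0.5 0.5
##     feature name: feature_1
##     value range: 0.0 100.0
##     feature name: feature_2
##     value range: 0.0 50.0
##     params for class: class1
##     mean: 20.0 30.0
##     covariance:
##         10.0  0.0
##          0.0 10.0
##
## with one "feature name ... value range" block per feature and one
## "params for class ... mean ... covariance" block per class.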
2398
sub read_parameter_file_numeric {
2399
0
0
0
my $self = shift;
2400
0
0
my @class_names = ();
2401
0
0
my %class_names_and_priors = ();
2402
0
0
my %features_with_value_range = ();
2403
0
0
my %classes_and_their_param_values = ();
2404
# my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2405
0
0
0
open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
2406
0
0
my @params = <FILE>;
2407
0
0
my $params = join "", @params;
2408
0
0
my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2409
0
0
$params =~ /$regex/si;
2410
0
0
my ($class_names, $class_priors) = ($1, $2);
2411
0
0
@class_names = split ' ', $class_names;
2412
0
0
my @class_priors = split ' ', $class_priors;
2413
0
0
foreach my $i (0..@class_names-1) {
2414
0
0
$class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2415
}
2416
0
0
0
if ($self->{_debug}) {
2417
0
0
foreach my $cname (keys %class_names_and_priors) {
2418
0
0
print "$cname => $class_names_and_priors{$cname}\n";
2419
}
2420
}
2421
0
0
$regex = 'feature name: \w*.*?value range: [\d\. -]+';
2422
0
0
my @features = $params =~ /$regex/gsi;
2423
0
0
my @features_ordered;
2424
0
0
$regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2425
0
0
foreach my $feature (@features) {
2426
0
0
$feature =~ /$regex/i;
2427
0
0
my $feature_name = $1;
2428
0
0
push @features_ordered, $feature_name;
2429
0
0
my @value_range = split ' ', $2;
2430
0
0
$features_with_value_range{$feature_name} = \@value_range;
2431
}
2432
0
0
0
if ($self->{_debug}) {
2433
0
0
foreach my $fname (keys %features_with_value_range) {
2434
0
0
print "$fname => @{$features_with_value_range{$fname}}\n";
0
0
2435
}
2436
}
2437
0
0
foreach my $i (0..@class_names-1) {
2438
0
0
$classes_and_their_param_values{$class_names[$i]} = {};
2439
}
2440
0
0
$regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2441
0
0
my @class_params = $params =~ /$regex/gsi;
2442
0
0
$regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2443
0
0
foreach my $class_param (@class_params) {
2444
0
0
$class_param =~ /$regex/gsi;
2445
0
0
my $class_name = $1;
2446
0
0
my @class_mean = split ' ', $2;
2447
0
0
$classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2448
0
0
my $class_param_string = $3;
2449
0
0
my @covar_rows = split '\n', $class_param_string;
2450
0
0
my @covar_matrix;
2451
0
0
foreach my $row (@covar_rows) {
2452
0
0
my @row = split ' ', $row;
2453
0
0
push @covar_matrix, \@row;
2454
}
2455
0
0
$classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2456
}
2457
0
0
0
if ($self->{_debug}) {
2458
0
0
print "\nThe class parameters are:\n\n";
2459
0
0
foreach my $cname (keys %classes_and_their_param_values) {
2460
0
0
print "\nFor class name $cname:\n";
2461
0
0
my %params_hash = %{$classes_and_their_param_values{$cname}};
0
0
2462
0
0
foreach my $x (keys %params_hash) {
2463
0
0
0
if ($x eq 'mean') {
2464
0
0
print " $x => @{$params_hash{$x}}\n";
0
0
2465
} else {
2466
0
0
0
if ($x eq 'covariance') {
2467
0
0
print " The covariance matrix:\n";
2468
0
0
my @matrix = @{$params_hash{'covariance'}};
0
0
2469
0
0
foreach my $row (@matrix) {
2470
0
0
print " @$row\n";
2471
}
2472
}
2473
}
2474
}
2475
}
2476
}
2477
0
0
$self->{_class_names} = \@class_names;
2478
0
0
$self->{_class_names_and_priors} = \%class_names_and_priors;
2479
0
0
$self->{_features_with_value_range} = \%features_with_value_range;
2480
0
0
$self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2481
0
0
$self->{_features_ordered} = \@features_ordered;
2482
}
2483
2484
## After the parameter file is parsed by the previous method, this method calls on
2485
## Math::Random::random_multivariate_normal() to generate the training and test data
2486
## samples. Your training and test data can be of any number of dimensions, can
2487
## have any mean, and any covariance. The training and test data must obviously be
2488
## drawn from the same distribution.
2489
sub gen_numeric_training_and_test_data_and_write_to_csv {
2490
1
1
579
use Math::Random;
1
4123
1
654
2491
0
0
0
my $self = shift;
2492
0
0
my %training_samples_for_class;
2493
my %test_samples_for_class;
2494
0
0
foreach my $class_name (@{$self->{_class_names}}) {
0
0
2495
0
0
$training_samples_for_class{$class_name} = [];
2496
0
0
$test_samples_for_class{$class_name} = [];
2497
}
2498
0
0
foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
0
0
2499
0
0
my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
0
0
2500
0
0
my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
0
0
2501
my @new_training_data = Math::Random::random_multivariate_normal(
2502
0
0
$self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2503
@mean, @covariance );
2504
my @new_test_data = Math::Random::random_multivariate_normal(
2505
0
0
$self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2506
@mean, @covariance );
2507
0
0
0
if ($self->{_debug}) {
2508
0
0
print "training data for class $class_name:\n";
2509
0
0
foreach my $x (@new_training_data) {print "@$x\n";}
0
0
2510
0
0
print "\n\ntest data for class $class_name:\n";
2511
0
0
foreach my $x (@new_test_data) {print "@$x\n";}
0
0
2512
}
2513
0
0
$training_samples_for_class{$class_name} = \@new_training_data;
2514
0
0
$test_samples_for_class{$class_name} = \@new_test_data;
2515
}
2516
0
0
my @training_data_records = ();
2517
0
0
my @test_data_records = ();
2518
0
0
foreach my $class_name (keys %training_samples_for_class) {
2519
my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2520
0
0
$self->{_class_names_and_priors}->{$class_name};
2521
my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2522
0
0
$self->{_class_names_and_priors}->{$class_name};
2523
0
0
foreach my $sample_index (0..$num_of_samples_for_training-1) {
2524
0
0
my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
0
0
2525
0
0
@training_vector = map {sprintf("%.3f", $_)} @training_vector;
0
0
2526
0
0
my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2527
0
0
push @training_data_records, $training_data_record;
2528
}
2529
0
0
foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2530
0
0
my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
0
0
2531
0
0
@test_vector = map {sprintf("%.3f", $_)} @test_vector;
0
0
2532
0
0
my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2533
0
0
push @test_data_records, $test_data_record;
2534
}
2535
}
2536
0
0
fisher_yates_shuffle(\@training_data_records);
2537
0
0
fisher_yates_shuffle(\@test_data_records);
2538
0
0
0
if ($self->{_debug}) {
2539
0
0
foreach my $record (@training_data_records) {
2540
0
0
print "$record";
2541
}
2542
0
0
foreach my $record (@test_data_records) {
2543
0
0
print "$record";
2544
}
2545
}
2546
0
0
open OUTPUT, ">$self->{_output_training_csv_file}";
2547
0
0
my @feature_names_training = @{$self->{_features_ordered}};
0
0
2548
0
0
my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
0
0
2549
0
0
my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2550
0
0
print OUTPUT "$first_row_training\n";
2551
0
0
foreach my $i (0..@training_data_records-1) {
2552
0
0
my $i1 = $i+1;
2553
0
0
my $sample_record = "\"$i1\",$training_data_records[$i]";
2554
0
0
print OUTPUT "$sample_record";
2555
}
2556
0
0
close OUTPUT;
2557
0
0
open OUTPUT, ">$self->{_output_test_csv_file}";
2558
0
0
my @feature_names_testing = keys %{$self->{_features_with_value_range}};
0
0
2559
0
0
my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
0
0
2560
0
0
my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2561
0
0
print OUTPUT "$first_row_testing\n";
2562
0
0
foreach my $i (0..@test_data_records-1) {
2563
0
0
my $i1 = $i+1;
2564
0
0
my $sample_record = "\"$i1\",$test_data_records[$i]";
2565
0
0
print OUTPUT "$sample_record";
2566
}
2567
0
0
close OUTPUT;
2568
}
2569
2570
# from perl docs:
2571
sub fisher_yates_shuffle {
2572
0
0
0
my $arr = shift;
2573
0
0
my $i = @$arr;
2574
0
0
while (--$i) {
2575
0
0
my $j = int rand( $i + 1 );
2576
0
0
@$arr[$i, $j] = @$arr[$j, $i];
2577
}
2578
}
2579
2580
########################### Generate Your Own Symbolic Training Data ###############################
2581
########################### Class TrainingDataGeneratorSymbolic #############################
2582
2583
## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2584
## this class for generating purely symbolic training and test data. The data is
2585
## generated according to the specifications you place in a parameter file whose
2586
## name you supply as one of the constructor arguments.
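##
## A minimal usage sketch (the file names below are placeholders; the
## constructor keys and method names are the ones defined in this class):
##
##     my $generator = TrainingDataGeneratorSymbolic->new(
##                         output_training_datafile       => 'training_symbolic.csv',
##                         parameter_file                 => 'param_symbolic.txt',
##                         number_of_samples_for_training => 200,
##                     );
##     $generator->read_parameter_file_symbolic();
##     $generator->gen_symbolic_training_data();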
2587
package TrainingDataGeneratorSymbolic;
2588
2589
1
1
8
use strict;
1
2
1
16
2590
1
1
3
use Carp;
1
1
1
1641
2591
2592
sub new {
2593
1
1
12
my ($class, %args) = @_;
2594
1
3
my @params = keys %args;
2595
1
50
3
croak "\nYou have used a wrong name for a keyword argument " .
2596
"--- perhaps a misspelling\n"
2597
if check_for_illegal_params4(@params) == 0;
2598
bless {
2599
_output_training_datafile => $args{'output_training_datafile'}
2600
|| die("name for output_training_datafile required"),
2601
_parameter_file => $args{'parameter_file'}
2602
|| die("parameter_file required"),
2603
_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2604
|| die("number_of_samples_for_training required"),
2605
1
50
18
_debug => $args{debug} || 0,
50
50
50
2606
_class_names => [],
2607
_class_priors => [],
2608
_features_and_values_hash => {},
2609
_bias_hash => {},
2610
_training_sample_records => {},
2611
}, $class;
2612
}
2613
2614
sub check_for_illegal_params4 {
2615
1
1
1
my @params = @_;
2616
1
3
my @legal_params = qw / output_training_datafile
2617
parameter_file
2618
number_of_samples_for_training
2619
debug
2620
/;
2621
1
1
my $found_match_flag;
2622
1
2
foreach my $param (@params) {
2623
3
3
foreach my $legal (@legal_params) {
2624
6
4
$found_match_flag = 0;
2625
6
100
9
if ($param eq $legal) {
2626
3
1
$found_match_flag = 1;
2627
3
3
last;
2628
}
2629
}
2630
3
50
5
last if $found_match_flag == 0;
2631
}
2632
1
3
return $found_match_flag;
2633
}
2634
2635
## Read a parameter file for generating symbolic training data. See the script
2636
## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2637
## pass the name of the parameter file to the constructor of the
2638
## TrainingDataGeneratorSymbolic class.
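##
## The parser below looks for `class names:', `class priors:', `feature:' and
## `bias:' entries; an illustrative (hypothetical) layout consistent with that
## parsing would be:
##
##     class names: class_a class_b
##     class priors: 0.4 0.6
##     feature: smell values: sweet pungent none
##     feature: color values: red green blue
##     bias: class: class_a smell: sweet=0.6 color: red=0.5
##     bias: class: class_b smell: none=0.5 color: blue=0.7
##
## Each bias entry names, for one class, a value of a feature that should be
## favored and the probability with which that value should occur.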
2639
sub read_parameter_file_symbolic {
2640
1
1
4
my $self = shift;
2641
1
4
my $debug = $self->{_debug};
2642
1
2
my $number_of_training_samples = $self->{_number_of_samples_for_training};
2643
1
1
my $input_parameter_file = $self->{_parameter_file};
2644
1
50
4
croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2645
1
1
my $output_file_training = $self->{_output_training_datafile};
2646
1
20
my $output_file_testing = $self->{_output_test_datafile};
2647
1
1
my @all_params;
2648
my $param_string;
2649
1
33
25
open(INPUT, $input_parameter_file) or die "unable to open parameter file: $!";
2650
1
28
@all_params = <INPUT>;
2651
1
3
@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
40
46
2652
1
2
@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
36
58
2653
1
9
$param_string = join ' ', @all_params;
2654
1
7
my ($class_names, $class_priors, $rest_param) =
2655
$param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2656
1
50
21
my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
3
11
2657
1
2
push @{$self->{_class_names}}, @class_names;
1
2
2658
1
50
4
my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
3
10
2659
1
1
push @{$self->{_class_priors}}, @class_priors;
1
2
2660
1
7
my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2661
1
1
my %features_and_values_hash;
2662
1
9
my @features = split /(feature[:])/, $feature_string;
2663
1
50
2
@features = grep {defined($_) && length($_) > 0} @features;
9
22
2664
1
2
foreach my $item (@features) {
2665
8
100
14
next if $item =~ /feature/;
2666
4
11
my @splits = split / /, $item;
2667
4
50
5
@splits = grep {defined($_) && length($_) > 0} @splits;
27
59
2668
4
7
foreach my $i (0..@splits-1) {
2669
22
100
23
if ($i == 0) {
2670
4
7
$features_and_values_hash{$splits[0]} = [];
2671
} else {
2672
18
100
26
next if $splits[$i] =~ /values/;
2673
14
9
push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
14
18
2674
}
2675
}
2676
}
2677
1
2
$self->{_features_and_values_hash} = \%features_and_values_hash;
2678
1
2
my %bias_hash = %{$self->{_bias_hash}};
1
2
2679
1
6
my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2680
1
50
1
@biases = grep {defined($_) && length($_) > 0} @biases;
5
14
2681
1
1
foreach my $item (@biases) {
2682
4
100
11
next if $item =~ /bias/;
2683
2
15
my @splits = split /\s+/, $item;
2684
2
50
2
@splits = grep {defined($_) && length($_) > 0} @splits;
18
40
2685
2
3
my $feature_name;
2686
2
3
foreach my $i (0..@splits-1) {
2687
16
100
29
if ($i == 0) {
100
2688
2
3
$bias_hash{$splits[0]} = {};
2689
} elsif ($splits[$i] =~ /(^.+)[:]$/) {
2690
8
5
$feature_name = $1;
2691
8
15
$bias_hash{$splits[0]}->{$feature_name} = [];
2692
} else {
2693
6
50
8
next if !defined $feature_name;
2694
6
50
8
push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
6
11
2695
if defined $feature_name;
2696
}
2697
}
2698
}
2699
1
1
$self->{_bias_hash} = \%bias_hash;
2700
1
50
10
if ($debug) {
2701
0
0
print "\n\nClass names: @class_names\n";
2702
0
0
my $num_of_classes = @class_names;
2703
0
0
print "Class priors: @class_priors\n";
2704
0
0
print "Number of classes: $num_of_classes\n";
2705
0
0
print "\nHere are the features and their possible values:\n";
2706
0
0
while ( my ($k, $v) = each %features_and_values_hash ) {
2707
0
0
print "$k ===> @$v\n";
2708
}
2709
0
0
print "\nHere is the biasing for each class:\n";
2710
0
0
while ( my ($k, $v) = each %bias_hash ) {
2711
0
0
print "$k:\n";
2712
0
0
while ( my ($k1, $v1) = each %$v ) {
2713
0
0
print " $k1 ===> @$v1\n";
2714
}
2715
}
2716
}
2717
}
2718
2719
## This method generates training data according to the specifications placed in a
2720
## parameter file that is read by the previous method.
2721
sub gen_symbolic_training_data {
2722
1
1
4
my $self = shift;
2723
1
2
my @class_names = @{$self->{_class_names}};
1
2
2724
1
2
my @class_priors = @{$self->{_class_priors}};
1
2
2725
1
1
my %training_sample_records;
2726
1
0
my %features_and_values_hash = %{$self->{_features_and_values_hash}};
1
4
2727
1
2
my %bias_hash = %{$self->{_bias_hash}};
1
1
2728
1
2
my $how_many_training_samples = $self->{_number_of_samples_for_training};
2729
1
1
my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2730
1
1
my %class_priors_to_unit_interval_map;
2731
1
1
my $accumulated_interval = 0;
2732
1
2
foreach my $i (0..@class_names-1) {
2733
2
4
$class_priors_to_unit_interval_map{$class_names[$i]}
2734
= [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2735
2
3
$accumulated_interval += $class_priors[$i];
2736
}
2737
1
50
3
if ($self->{_debug}) {
2738
0
0
print "Mapping of class priors to unit interval: \n";
2739
0
0
while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2740
0
0
print "$k => @$v\n";
2741
}
2742
0
0
print "\n\n";
2743
}
2744
1
1
my $ele_index = 0;
2745
1
2
while ($ele_index < $how_many_training_samples) {
2746
35
33
my $sample_name = "sample" . "_$ele_index";
2747
35
50
$training_sample_records{$sample_name} = [];
2748
# Generate class label for this training sample:
2749
35
47
my $roll_the_dice = rand(1.0);
2750
35
22
my $class_label;
2751
35
38
foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2752
49
32
my $v = $class_priors_to_unit_interval_map{$class_name};
2753
49
100
66
123
if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2754
35
25
push @{$training_sample_records{$sample_name}},
35
51
2755
"class=" . $class_name;
2756
35
23
$class_label = $class_name;
2757
35
26
last;
2758
}
2759
}
2760
35
47
foreach my $feature (keys %features_and_values_hash) {
2761
140
90
my @values = @{$features_and_values_hash{$feature}};
140
190
2762
140
125
my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2763
140
111
my $no_bias = 1.0 / @values;
2764
140
100
252
$bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2765
140
84
my %value_priors_to_unit_interval_map;
2766
140
325
my @splits = split /\s*=\s*/, $bias_string;
2767
140
103
my $chosen_for_bias_value = $splits[0];
2768
140
77
my $chosen_bias = $splits[1];
2769
140
135
my $remaining_bias = 1 - $chosen_bias;
2770
140
115
my $remaining_portion_bias = $remaining_bias / (@values -1);
2771
140
50
112
@splits = grep {defined($_) && length($_) > 0} @splits;
280
745
2772
140
95
my $accumulated = 0;
2773
140
152
foreach my $i (0..@values-1) {
2774
490
100
461
if ($values[$i] eq $chosen_for_bias_value) {
2775
140
188
$value_priors_to_unit_interval_map{$values[$i]}
2776
= [$accumulated, $accumulated + $chosen_bias];
2777
140
132
$accumulated += $chosen_bias;
2778
} else {
2779
350
415
$value_priors_to_unit_interval_map{$values[$i]}
2780
= [$accumulated, $accumulated + $remaining_portion_bias];
2781
350
269
$accumulated += $remaining_portion_bias;
2782
}
2783
}
2784
140
107
my $roll_the_dice = rand(1.0);
2785
140
82
my $value_label;
2786
140
160
foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2787
299
185
my $v = $value_priors_to_unit_interval_map{$value_name};
2788
299
100
100
702
if ( ($roll_the_dice >= $v->[0])
2789
&& ($roll_the_dice <= $v->[1]) ) {
2790
140
75
push @{$training_sample_records{$sample_name}},
140
244
2791
$feature . "=" . $value_name;
2792
140
99
$value_label = $value_name;
2793
140
95
last;
2794
}
2795
}
2796
140
50
373
if ($self->{_debug}) {
2797
0
0
print "mapping feature value priors for '$feature' " .
2798
"to unit interval: \n";
2799
0
0
while ( my ($k, $v) =
2800
each %value_priors_to_unit_interval_map ) {
2801
0
0
print "$k => @$v\n";
2802
}
2803
0
0
print "\n\n";
2804
}
2805
}
2806
35
55
$ele_index++;
2807
}
2808
1
1
$self->{_training_sample_records} = \%training_sample_records;
2809
1
50
3
if ($self->{_debug}) {
2810
0
0
print "\n\nPRINTING TRAINING RECORDS:\n\n";
2811
0
0
foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
0
0
2812
0
0
print "$kee => @{$training_sample_records{$kee}}\n\n";
0
0
2813
}
2814
}
2815
1
2
my $output_training_file = $self->{_output_training_datafile};
2816
1
50
2
print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2817
1
82
open FILEHANDLE, ">$output_training_file";
2818
1
7
my @features = sort keys %features_and_values_hash;
2819
1
2
my $title_string = ',class';
2820
1
1
foreach my $feature_name (@features) {
2821
4
6
$title_string .= ',' . $feature_name;
2822
}
2823
1
10
print FILEHANDLE "$title_string\n";
2824
1
11
my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
105
72
35
49
35
45
2825
1
4
my $record_string = '';
2826
1
2
foreach my $sample_name (@sample_names) {
2827
35
40
$record_string .= "$sample_name,";
2828
35
23
my @record = @{$training_sample_records{"sample_$sample_name"}};
35
50
2829
35
46
my %item_parts_hash;
2830
35
26
foreach my $item (@record) {
2831
175
228
my @splits = grep $_, split /=/, $item;
2832
175
191
$item_parts_hash{$splits[0]} = $splits[1];
2833
}
2834
35
27
$record_string .= $item_parts_hash{"class"};
2835
35
26
delete $item_parts_hash{"class"};
2836
35
61
my @kees = sort keys %item_parts_hash;
2837
35
33
foreach my $kee (@kees) {
2838
140
108
$record_string .= ",$item_parts_hash{$kee}";
2839
}
2840
35
33
print FILEHANDLE "$record_string\n";
2841
35
54
$record_string = '';
2842
}
2843
1
50
close FILEHANDLE;
2844
}
2845
2846
sub sample_index {
2847
0
0
my $arg = shift;
2848
0
$arg =~ /_(.+)$/;
2849
0
return $1;
2850
}
2851
2852
################################# Decision Tree Introspection #######################################
2853
################################# Class DTIntrospection #######################################
2854
2855
package DTIntrospection;
2856
2857
## Instances constructed from this class can provide explanations for the
2858
## classification decisions at the nodes of a decision tree.
2859
##
2860
## When used in the interactive mode, the decision-tree introspection made possible
2861
## by this class provides answers to the following three questions: (1) List of the
2862
## training samples that fall in the portion of the feature space that corresponds
2863
## to a node of the decision tree; (2) The probabilities associated with the last
2864
## feature test that led to the node; and (3) The class probabilities predicated on
2865
## just the last feature test on the path to that node.
2866
##
2867
## CAVEAT: It is possible for a node to exist even when there are no training
2868
## samples in the portion of the feature space that corresponds to the node. That
2869
## is because a decision tree is based on the probability densities estimated from
2870
## the training data. When training data is non-uniformly distributed, it is
2871
## possible for the probability associated with a point in the feature space to be
2872
## non-zero even when there are no training samples at or in the vicinity of that
2873
## point.
2874
##
2875
## For a node to exist even where there are no training samples in the portion of
2876
## the feature space that belongs to the node is an indication of the generalization
2877
## ability of decision-tree based classification.
2878
##
2879
## When used in a non-interactive mode, an instance of this class can be used to
2880
## create a tabular display that shows what training samples belong directly to the
2881
## portion of the feature space that corresponds to each node of the decision tree.
2882
## An instance of this class can also construct a tabular display that shows how the
2883
## influence of each training sample propagates in the decision tree. For each
2884
## training sample, this display first shows the list of nodes that came into
2885
## existence through feature test(s) that used the data provided by that sample.
2886
## This list for each training sample is followed by a subtree of the nodes that owe
2887
## their existence indirectly to the training sample. A training sample influences a
2888
## node indirectly if the node is a descendant of another node that is affected
2889
## directly by the training sample.
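##
## A minimal usage sketch, assuming $dt is an already constructed
## Algorithm::DecisionTree instance on which construct_decision_tree_classifier()
## has been called:
##
##     my $introspector = DTIntrospection->new($dt);
##     $introspector->initialize();
##
##     # non-interactive tabular displays:
##     $introspector->display_training_samples_at_all_nodes_direct_influence_only();
##     $introspector->display_training_samples_to_nodes_influence_propagation();
##
##     # interactive mode:
##     $introspector->explain_classifications_at_multiple_nodes_interactively();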
2890
2891
1
1
5
use strict;
1
0
1
20
2892
1
1
4
use Carp;
1
0
1
1847
2893
2894
sub new {
2895
0
0
my ($class, $dt) = @_;
2896
0
0
croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2897
unless ref($dt) eq "Algorithm::DecisionTree";
2898
bless {
2899
_dt => $dt,
2900
_root_dtnode => $dt->{_root_node},
2901
0
_samples_at_nodes_hash => {},
2902
_branch_features_to_nodes_hash => {},
2903
_sample_to_node_mapping_direct_hash => {},
2904
_node_serial_num_to_node_hash => {},
2905
_awareness_raising_msg_shown => 0,
2906
_debug => 0,
2907
}, $class;
2908
}
2909
2910
sub initialize {
2911
0
0
my $self = shift;
2912
croak "You must first construct the decision tree before using the DTIntrospection class"
2913
0
0
unless $self->{_root_dtnode};
2914
0
$self->recursive_descent($self->{_root_dtnode});
2915
}
2916
2917
sub recursive_descent {
2918
0
0
my $self = shift;
2919
0
my $node = shift;
2920
0
my $node_serial_number = $node->get_serial_num();
2921
0
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2922
0
0
print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
0
2923
0
$self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2924
0
$self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2925
0
my @samples_at_node = ();
2926
0
foreach my $item (@$branch_features_and_values_or_thresholds) {
2927
0
my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2928
0
0
unless (@samples_at_node) {
2929
0
@samples_at_node = @$samples_for_feature_value_combo;
2930
} else {
2931
0
my @accum;
2932
0
foreach my $sample (@samples_at_node) {
2933
0
0
push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2934
}
2935
0
@samples_at_node = @accum;
2936
}
2937
0
0
last unless @samples_at_node;
2938
}
2939
0
@samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
0
2940
0
0
print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2941
0
$self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2942
0
0
if (@samples_at_node) {
2943
0
foreach my $sample (@samples_at_node) {
2944
0
0
if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2945
0
$self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2946
} else {
2947
0
push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
0
2948
}
2949
}
2950
}
2951
0
my $children = $node->get_children();
2952
0
foreach my $child (@$children) {
2953
0
$self->recursive_descent($child);
2954
}
2955
}
2956
2957
sub display_training_samples_at_all_nodes_direct_influence_only {
2958
0
0
my $self = shift;
2959
croak "You must first construct the decision tree before using the DT Introspection class."
2960
0
0
unless $self->{_root_dtnode};
2961
0
$self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2962
}
2963
2964
sub recursive_descent_for_showing_samples_at_a_node{
2965
0
0
my $self = shift;
2966
0
my $node = shift;
2967
0
my $node_serial_number = $node->get_serial_num();
2968
0
my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2969
0
0
if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2970
0
0
print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
0
2971
0
print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
0
2972
}
2973
0
map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
0
2974
}
2975
2976
sub display_training_samples_to_nodes_influence_propagation {
2977
0
0
my $self = shift;
2978
0
foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
0
0
2979
0
0
if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2980
0
my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2981
0
print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
0
2982
0
print " nodes affected through probabilistic generalization:\n";
2983
0
map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2984
}
2985
}
2986
}
2987
2988
sub recursive_descent_for_sample_to_node_influence {
2989
0
0
my $self = shift;
2990
0
my $node_serial_num = shift;
2991
0
my $nodes_already_accounted_for = shift;
2992
0
my $offset = shift;
2993
0
$offset .= " ";
2994
0
my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2995
0
my @children = map $_->get_serial_num(), @{$node->get_children()};
0
2996
0
my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
0
0
2997
0
0
if (@children_affected) {
2998
0
print "$offset $node_serial_num => [@children_affected]\n";
2999
}
3000
0
map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
3001
}
3002
3003
sub get_samples_for_feature_value_combo {
3004
0
0
my $self = shift;
3005
0
my $feature_value_combo = shift;
3006
0
my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3007
0
my @samples = ();
3008
0
0
if ($op eq '=') {
0
0
3009
0
@samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
0
0
3010
} elsif ($op eq '<') {
3011
0
foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
0
3012
0
my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
0
3013
0
foreach my $item (@features_and_values) {
3014
0
my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3015
0
0
0
if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
0
3016
0
push @samples, $sample;
3017
0
last;
3018
}
3019
}
3020
}
3021
} elsif ($op eq '>') {
3022
0
foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
0
3023
0
my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
0
3024
0
foreach my $item (@features_and_values) {
3025
0
my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3026
0
0
0
if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
0
3027
0
push @samples, $sample;
3028
0
last;
3029
}
3030
}
3031
}
3032
} else {
3033
0
die "Something strange is going on";
3034
}
3035
0
return \@samples;
3036
}
3037
3038
sub extract_feature_op_val {
3039
0
0
my $self = shift;
3040
0
my $feature_value_combo = shift;
3041
0
my $pattern1 = '(.+)=(.+)';
3042
0
my $pattern2 = '(.+)<(.+)';
3043
0
my $pattern3 = '(.+)>(.+)';
3044
0
my ($feature,$value,$op);
3045
0
0
if ($feature_value_combo =~ /$pattern2/) {
0
0
3046
0
($feature,$op,$value) = ($1,'<',$2);
3047
} elsif ($feature_value_combo =~ /$pattern3/) {
3048
0
($feature,$op,$value) = ($1,'>',$2);
3049
} elsif ($feature_value_combo =~ /$pattern1/) {
3050
0
($feature,$op,$value) = ($1,'=',$2);
3051
}
3052
0
return ($feature,$op,$value);
3053
}
3054
3055
sub explain_classifications_at_multiple_nodes_interactively {
3056
0
0
my $self = shift;
3057
croak "You called explain_classification_at_multiple_nodes_interactively() without " .
3058
"first initializing the DTIntrospection instance in your code. Aborting."
3059
0
0
unless $self->{_samples_at_nodes_hash};
3060
0
print "\n\nIn order for the decision tree to introspect\n\n";
3061
0
print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3062
" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3063
" with it even when there are NO training data points in that region?\n" .
3064
"\nEnter 'y' for yes or any other character for no: ";
3065
0
my $ans = <STDIN>;
3066
0
$ans =~ s/^\s*|\s*$//g;
3067
0
0
die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3068
0
$self->{_awareness_raising_msg_shown} = 1;
3069
0
while (1) {
3070
0
my $node_id;
3071
my $ans;
3072
0
while (1) {
3073
0
print "\nEnter the integer ID of a node: ";
3074
0
$ans = <STDIN>;
3075
0
$ans =~ s/^\s*|\s*$//g;
3076
0
0
return if $ans =~ /^exit$/;
3077
0
0
last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
0
3078
0
print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3079
}
3080
0
$node_id = $ans;
3081
0
$self->explain_classification_at_one_node($node_id)
3082
}
3083
}
3084
3085
sub explain_classification_at_one_node {
3086
0
0
my $self = shift;
3087
0
my $node_id = shift;
3088
croak "You called explain_classification_at_one_node() without first initializing " .
3089
0
0
"the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3090
0
0
unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3091
0
print "Node $node_id is not a node in the tree\n";
3092
0
return;
3093
}
3094
0
0
unless ($self->{_awareness_raising_msg_shown}) {
3095
0
print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3096
0
print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3097
" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3098
" with it even when there are NO training data points in that region?\n" .
3099
"\nEnter 'y' for yes or any other character for no: ";
3100
0
my $ans = <STDIN>;
3101
0
$ans =~ s/^\s*|\s*$//g;
3102
0
0
die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3103
}
3104
0
my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
0
3105
0
my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
0
3106
# my @class_names = @{DTNode->get_class_names()};
3107
0
my @class_names = @{$self->get_class_names()};
0
3108
0
my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3109
0
my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3110
0
0
my $msg = @samples_at_node == 0
3111
? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3112
: "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3113
0
$msg .= "\n Features tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3114
0
$msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3115
0
$msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3116
0
print $msg;
3117
0
my $ans = <STDIN>;
3118
0
$ans =~ s/^\s*|\s*$//g;
3119
0
0
if ($ans =~ /^ye?s?$/) {
3120
0
my $sequence = [$branch_features_to_node[-1]];
3121
0
my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3122
0
print "\n probability of @{$sequence} is: $prob\n";
0
3123
}
3124
0
$msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3125
0
$msg .= "\n Enter 'y' for yes and any other character for no: ";
3126
0
print $msg;
3127
0
$ans = <STDIN>;
3128
0
$ans =~ s/^\s*|\s*$//g;
3129
0
0
if ($ans =~ /^ye?s?$/) {
3130
0
my $sequence = [$branch_features_to_node[-1]];
3131
0
foreach my $cls (@class_names) {
3132
0
my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3133
0
print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
0
3134
}
3135
} else {
3136
0
print "goodbye\n";
3137
}
3138
0
print "\n Finished supplying information on Node $node_id\n\n";
3139
}
3140
3141
1;
3142
3143
=pod
3144
3145
=head1 NAME
3146
3147
Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3148
multidimensional data.
3149
3150
3151
=head1 SYNOPSIS
3152
3153
# FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3154
3155
# In general, your call for constructing an instance of the DecisionTree class
3156
# will look like:
3157
3158
my $training_datafile = "stage3cancer.csv";
3159
my $dt = Algorithm::DecisionTree->new(
3160
training_datafile => $training_datafile,
3161
csv_class_column_index => 2,
3162
csv_columns_for_features => [3,4,5,6,7,8],
3163
entropy_threshold => 0.01,
3164
max_depth_desired => 8,
3165
symbolic_to_numeric_cardinality_threshold => 10,
3166
);
3167
3168
# The constructor option `csv_class_column_index' informs the module as to which
3169
# column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3170
# BASED. The constructor option `csv_columns_for_features' specifies which columns
3171
# are to be used for feature values. The first row of the CSV file must specify
3172
# the names of the features. See examples of CSV files in the `Examples'
3173
# subdirectory.
3174
3175
# The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3176
# the example shown above, if an ostensibly numeric feature takes on only 10 or
3177
# fewer different values in your training datafile, it will be treated like a
3178
# symbolic feature. The option `entropy_threshold' determines the granularity
3179
# with which the entropies are sampled for the purpose of calculating entropy gain
3180
# with a particular choice of decision threshold for a numeric feature or a feature
3181
# value for a symbolic feature.
3182
3183
# After you have constructed an instance of the DecisionTree class as shown above,
3184
# you read in the training data file and initialize the probability cache by
3185
# calling:
3186
3187
$dt->get_training_data();
3188
$dt->calculate_first_order_probabilities();
3189
$dt->calculate_class_priors();
3190
3191
# Next you construct a decision tree for your training data by calling:
3192
3193
$root_node = $dt->construct_decision_tree_classifier();
3194
3195
# where $root_node is an instance of the DTNode class that is also defined in the
3196
# module file. Now you are ready to classify a new data record. Let's say that
3197
# your data record looks like:
3198
3199
my @test_sample = qw / g2=4.2
3200
grade=2.3
3201
gleason=4
3202
eet=1.7
3203
age=55.0
3204
ploidy=diploid /;
3205
3206
# You can classify it by calling:
3207
3208
my $classification = $dt->classify($root_node, \@test_sample);
3209
3210
# The call to `classify()' returns a reference to a hash whose keys are the class
3211
# names and whose values are the associated classification probabilities. This hash also
3212
# includes another key-value pair for the solution path from the root node to the
3213
# leaf node at which the final classification was carried out.
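# For example, you could display the per-class probabilities returned by
# `classify()' with something like the following sketch. (It assumes that the
# solution-path entry in the returned hash is stored under a key named
# 'solution_path'; that key is skipped when printing the class probabilities.)

foreach my $class_name (sort keys %$classification) {
next if $class_name eq 'solution_path'; # assumed key name for the solution path
print "$class_name => $classification->{$class_name}\n";
}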
3214
3215
3216
=head1 CHANGES
3217
3218
B All the changes made in this version relate to the construction of
3219
regression trees. I have fixed a couple of bugs in the calculation of the regression
3220
coefficients. Additionally, the C class now comes with a new
3221
constructor parameter named C. For most cases, you'd set this
3222
parameter to 0, which causes the regression coefficients to be estimated through
3223
linear least-squares minimization.
3224
3225
B In addition to constructing decision trees, this version of the
3226
module also allows you to construct regression trees. The regression tree capability
3227
has been packed into a separate subclass, named C, of the main
3228
C class. The subdirectory C in the main
3229
installation directory illustrates how you can use this new functionality of the
3230
module.
3231
3232
B This version incorporates four very significant upgrades/changes to
3233
the C module: B<(1)> The CSV cleanup is now the default. So you do not
3234
have to set any special parameters in the constructor calls to initiate CSV
3235
cleanup. B<(2)> In the form of a new Perl class named C,
3236
this module provides you with an easy-to-use programming interface for attempting
3237
needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3238
dominated by a single class. You need to set the constructor parameter
3239
C to invoke the logic that constructs multiple
3240
decision trees, each using the minority class samples along with samples drawn
3241
randomly from the majority class. The final classification is made through a
3242
majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3243
big-data problem --- in the sense that you have been given a training database with a
3244
very large number of training records --- the class C will
3245
also let you construct multiple decision trees by pulling training data randomly from
3246
your training database (without paying attention to the relative populations of the
3247
classes). The final classification decision for a test sample is based on a majority
3248
vote from all the decision trees thus constructed. See the C
3249
directory for how to use these new features of the module. And, finally, B<(4)>
3250
Support for the old-style '.dat' training files has been dropped in this version.
3251
3252
B This version makes it easier to use a CSV training file that
3253
violates the assumption that a comma be used only to separate the different field
3254
values in a line record. Some large econometrics databases use double-quoted values
3255
for fields, and these values may contain commas (presumably for better readability).
3256
This version also allows you to specify the leftmost entry in the first CSV record
3257
that names all the fields. Previously, this entry was required to be an empty
3258
double-quoted string. I have also made some minor changes to the
3259
'C' method to make it more user friendly for large
3260
training files that may contain tens of thousands of records. When pulling training
3261
data from such files, this method prints out a dot on the terminal screen for every
3262
10000 records it has processed.
3263
3264
B This version brings the boosting capability to the C
3265
module.
3266
3267
B This version adds bagging to the C module. If your
3268
training dataset is large enough, you can ask the module to construct multiple
3269
decision trees using data bags extracted from your dataset. The module can show you
3270
the results returned by the individual decision trees and also the results obtained
3271
by taking a majority vote of the classification decisions made by the individual
3272
trees. You can specify any arbitrary extent of overlap between the data bags.
3273
3274
B The introspection capability in this version packs more of a punch.
3275
For each training data sample, you can now figure out not only the decision-tree
3276
nodes that are affected directly by that sample, but also those nodes that are
3277
affected indirectly through the generalization achieved by the probabilistic modeling
3278
of the data. The 'examples' directory of this version includes additional scripts
3279
that illustrate these enhancements to the introspection capability. See the section
3280
"The Introspection API" for a declaration of the introspection related methods, old
3281
and new.
3282
3283
B In response to requests from several users, this version includes a new
3284
capability: You can now ask the module to introspect about the classification
3285
decisions returned by the decision tree. Toward that end, the module includes a new
3286
class named C. Perhaps the most important bit of information you
3287
are likely to seek through DT introspection is the list of the training samples that
3288
fall directly in the portion of the feature space that is assigned to a node.
3289
B When training samples are non-uniformly distributed in the underlying
3290
feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3291
IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3292
part of the generalization achieved by probabilistic modeling of the training data.)>
3293
For additional information related to DT introspection, see the section titled
3294
"DECISION TREE INTROSPECTION" in this documentation page.
3295
3296
B makes the logic of tree construction from the old-style '.dat' training
3297
files more consistent with how trees are constructed from the data in `.csv' files.
3298
The inconsistency in the past was with respect to the naming convention for the class
3299
labels associated with the different data records.
3300
3301
B fixes a bug in the part of the module that some folks use for generating
3302
synthetic data for experimenting with decision tree construction and classification.
3303
In the class C that is a part of the module, there
3304
was a problem with the order in which the features were recorded from the
3305
user-supplied parameter file. The basic code for decision tree construction and
3306
classification remains unchanged.
3307
3308
B further downshifts the required version of Perl for this module. This
3309
was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3310
in the module code needed to be changed for the module to work with the older version
3311
of Perl.
3312
3313
B fixes the C restriction on the required Perl version. This
3314
version should work with Perl versions 5.14.0 and higher.
3315
3316
B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3317
else remains the same.
3318
3319
B should prove more robust when the probability distribution for the
3320
values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3321
observations can occur with significant probabilities. A new option in the
3322
DecisionTree constructor lets the user specify the precision with which the
3323
probability distributions are estimated for such features.
3324
3325
B fixes a bug that was caused by the explicitly set zero values for
3326
numerical features being misconstrued as "false" in the conditional statements in
3327
some of the method definitions.
3328
3329
B makes it easier to write code for classifying in one go all of your test
3330
data samples in a CSV file. The bulk classifications obtained can be written out to
3331
either a CSV file or to a regular text file. See the script
3332
C in the C directory for how to
3333
classify all of your test data records in a CSV file. This version also includes
3334
improved code for generating synthetic numeric/symbolic training and test data
3335
records for experimenting with the decision tree classifier.
3336
3337
B allows you to test the quality of your training data by running a 10-fold
3338
cross-validation test on the data. This test divides all of the training data into
3339
ten parts, with nine parts used for training a decision tree and one part used for
3340
testing its ability to classify correctly. This selection of nine parts for training
3341
and one part for testing is carried out in all of the ten different ways that are
3342
possible. This testing functionality in Version 2.1 can also be used to find the
3343
best values to use for the constructor parameters C,
3344
C, and C.
3345
3346
B Now you can use both numeric and
3347
symbolic features for constructing a decision tree. A feature is numeric if it can
3348
take any floating-point value over an interval.
3349
3350
B fixes a bug in the code that was triggered by 0 being declared as one of
3351
the feature values in the training datafile. Version 1.71 also includes an additional
3352
safety feature that is useful for training datafiles that contain a very large number
3353
of features. The new version makes sure that the number of values you declare for
3354
each sample record matches the number of features declared at the beginning of the
3355
training datafile.
3356
3357
B includes safety checks on the consistency of the data you place in your
3358
training datafile. When a training file contains thousands of samples, it is
3359
difficult to manually check that you used the same class names in your sample records
3360
that you declared at the top of your training file or that the values you have for your
3361
features are legal vis-a-vis the earlier declarations of the values in the training
3362
file. Another safety feature incorporated in this version is the non-consideration
3363
of classes that are declared at the top of the training file but that have no sample
3364
records in the file.
3365
3366
B uses probability caching much more extensively compared to the previous
3367
versions. This should result in faster construction of large decision trees.
3368
Another new feature in Version 1.6 is the use of a decision tree for interactive
3369
classification. In this mode, after you have constructed a decision tree from the
3370
training data, the user is prompted for answers to the questions pertaining to the
3371
feature tests at the nodes of the tree.
3372
3373
Some key elements of the documentation were cleaned up and made more readable in
3374
B. The implementation code remains unchanged from Version 1.4.
3375
3376
B should make things faster (and easier) for folks who want to use this
3377
module with training data that creates very large decision trees (that is, trees with
3378
tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3379
achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3380
addition, this version provides an additional constructor parameter,
3381
C for controlling the size of the decision tree. This is in
3382
addition to the tree size control achieved by the parameter C that
3383
was introduced in Version 1.3. Since large decision trees can take a long time to
3384
create, you may find yourself wishing you could store the tree you just created in a
3385
disk file and that, subsequently, you could use the stored tree for classification
3386
work. The C directory contains two scripts, C and
3387
C, that show how you can do exactly that with the
3388
help of Perl's C module.
3389
3390
B addresses the issue that arises when the header of a training datafile
3391
declares a certain possible value for a feature but that (feature,value) pair does
3392
NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3393
user to control the size of the decision tree by changing the value of the parameter
3394
C. Additionally, Version 1.3 includes a method called
3395
C that displays useful information regarding the size and
3396
some other attributes of the training data. It also warns the user if the training
3397
data might result in a decision tree that would simply be much too large --- unless
3398
the user controls the size with the entropy_threshold parameter.
3399
3400
In addition to the removal of a couple of serious bugs, B incorporates a
3401
number of enhancements: (1) Version 1.2 includes checks on the names of the features
3402
and values used in test data --- this is the data you want to classify with the
3403
decision tree classifier constructed by this module. (2) Version 1.2 includes a
3404
separate constructor for generating test data. To make it easier to generate test
3405
data whose probabilistic parameters may not be identical to that used for the
3406
training data, I have used separate routines for generating the test data. (3)
3407
Version 1.2 also includes in its examples directory a script that classifies the test
3408
data in a file and outputs the class labels into another file. This is for folks who
3409
do not wish to write their own scripts using this module. (4) Version 1.2 also
3410
includes addition to the documentation regarding the issue of numeric values for
3411
features.
3412
3413
=head1 DESCRIPTION
3414
3415
B is a I module for constructing a decision tree from
3416
a training datafile containing multidimensional data. In one form or another,
3417
decision trees have been around for about fifty years. From a statistical
3418
perspective, they are closely related to classification and regression by recursive
3419
partitioning of multidimensional data. Early work that demonstrated the usefulness
3420
of such partitioning of data for classification and regression can be traced to the
3421
work of Terry Therneau in the early 1980's in the statistics community, and to the
3422
work of Ross Quinlan in the mid 1990's in the machine learning community.
3423
3424
For those not familiar with decision tree ideas, the traditional way to classify
3425
multidimensional data is to start with a feature space whose dimensionality is the
3426
same as that of the data. Each feature in this space corresponds to the attribute
3427
that each dimension of the data measures. You then use the training data to carve up
3428
the feature space into different regions, each corresponding to a different class.
3429
Subsequently, when you try to classify a new data sample, you locate it in the
3430
feature space and find the class label of the region to which it belongs. One can
3431
also give the new data point the same class label as that of the nearest training
3432
sample. This is referred to as the nearest neighbor classification. There exist
3433
hundreds of variations of varying power on these two basic approaches to the
3434
classification of multidimensional data.
3435
3436
A decision tree classifier works differently. When you construct a decision tree,
3437
you select for the root node a feature test that partitions the training data in a
3438
way that causes maximal disambiguation of the class labels associated with the data.
3439
In terms of information content as measured by entropy, such a feature test would
3440
cause maximum reduction in class entropy in going from all of the training data taken
3441
together to the data as partitioned by the feature test. You then drop from the root
3442
node a set of child nodes, one for each partition of the training data created by the
3443
feature test at the root node. When your features are purely symbolic, you'll have
3444
one child node for each value of the feature chosen for the feature test at the root.
3445
When the test at the root involves a numeric feature, you find the decision threshold
3446
for the feature that best bipartitions the data and you drop from the root node two
3447
child nodes, one for each partition. Now at each child node you pose the same
3448
question that you posed when you found the best feature to use at the root: Which
3449
feature at the child node in question would maximally disambiguate the class labels
3450
associated with the training data corresponding to that child node?
3451
3452
As the reader would expect, the two key steps in any approach to decision-tree based
3453
classification are the construction of the decision tree itself from a file
3454
containing the training data, and then using the decision tree thus obtained for
3455
classifying new data.
3456
3457
What is cool about decision tree classification is that it gives you soft
3458
classification, meaning it may associate more than one class label with a given data
3459
vector. When this happens, it may mean that your classes are indeed overlapping in
3460
the underlying feature space. It could also mean that you simply have not supplied
3461
sufficient training data to the decision tree classifier. For a tutorial
3462
introduction to how a decision tree is constructed and used, visit
3463
L
3464
3465
This module also allows you to generate your own synthetic training and test
3466
data. Generating your own training data, using it for constructing a decision-tree
3467
classifier, and subsequently testing the classifier on a synthetically generated
3468
test set of data is a good way to develop greater proficiency with decision trees.
3469
3470
3471
=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3472
3473
If you are new to the concept of a decision tree, their practical utility is best
3474
understood with an example that only involves symbolic features. However, as
3475
mentioned earlier, versions of the module higher than 2.0 allow you to use both
3476
symbolic and numeric features.
3477
3478
Consider the following scenario: Let's say you are running a small investment company
3479
that employs a team of stockbrokers who make buy/sell decisions for the customers of
3480
your company. Assume that your company has asked the traders to make each investment
3481
decision on the basis of the following four criteria:
3482
3483
price_to_earnings_ratio (P_to_E)
3484
3485
price_to_sales_ratio (P_to_S)
3486
3487
return_on_equity (R_on_E)
3488
3489
market_share (MS)
3490
3491
Since you are the boss, you keep track of the buy/sell decisions made by the
3492
individual traders. But one unfortunate day, all of your traders decide to quit
3493
because you did not pay them enough. So what do you do? If you had a module like
3494
the one here, you could still run your company and do so in such a way that, on the
3495
average, would do better than any of the individual traders who worked for your
3496
company. This is what you do: You pool together the individual trader buy/sell
3497
decisions you have accumulated during the last one year. This pooled information is
3498
likely to look like:
3499
3500
3501
example buy/sell P_to_E P_to_S R_on_E MS
3502
==============================================================
3503
3504
example_1 buy high low medium low
3505
example_2 buy medium medium low low
3506
example_3 sell low medium low high
3507
....
3508
....
3509
3510
This data, when formatted according to CSV, would constitute your training file. You
3511
could feed this file into the module by calling:
3512
3513
my $dt = Algorithm::DecisionTree->new(
3514
training_datafile => $training_datafile,
3515
csv_class_column_index => 1,
3516
csv_columns_for_features => [2,3,4,5],
3517
);
3518
$dt->get_training_data();
3519
$dt->calculate_first_order_probabilities();
3520
$dt->calculate_class_priors();
3521
3522
Subsequently, you would construct a decision tree by calling:
3523
3524
my $root_node = $dt->construct_decision_tree_classifier();
3525
3526
Now you and your company (with practically no employees) are ready to service the
3527
customers again. Suppose your computer needs to make a buy/sell decision about an
3528
investment prospect that is best described by:
3529
3530
price_to_earnings_ratio = low
3531
price_to_sales_ratio = very_low
3532
return_on_equity = none
3533
market_share = medium
3534
3535
All that your computer would need to do would be to construct a data vector like
3536
3537
my @data = qw / P_to_E=low
3538
P_to_S=very_low
3539
R_on_E=none
3540
MS=medium /;
3541
3542
and call the decision tree classifier you just constructed by
3543
3544
$dt->classify($root_node, \@data);
3545
3546
The answer returned will be 'buy' and 'sell', along with the associated
3547
probabilities. So if the probability of 'buy' is considerably greater than the
3548
probability of 'sell', that's what you should instruct your computer to do.
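To see how you might read those probabilities out in code, here is a minimal sketch that simply continues from the $dt and $root_node already constructed above; nothing in it goes beyond the calls shown in this section, and the loop at the end just prints each class label with its probability:

    my @data = qw/ P_to_E=low  P_to_S=very_low  R_on_E=none  MS=medium /;
    my $classification = $dt->classify($root_node, \@data);

    # The returned hash reference maps class names to probabilities; it also
    # carries a special entry whose value is an array reference holding the
    # path through the tree, so anything that is not a plain number is skipped.
    foreach my $label (sort keys %$classification) {
        next if ref $classification->{$label};
        print "$label => $classification->{$label}\n";
    }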
3549
3550
The chances are that, on the average, this approach would beat the performance of any
3551
of your individual traders who worked for you previously since the buy/sell decisions
3552
made by the computer would be based on the collective wisdom of all your previous
3553
traders. B
3554
captured by the silly little example here. However, it does nicely convey the
3555
sense in which the current module could be used.>
3556
3557
=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3558
3559
A feature is symbolic when its values are compared using string comparison operators.
3560
By the same token, a feature is numeric when its values are compared using numeric
3561
comparison operators. Having said that, features that take only a small number of
3562
numeric values in the training data can be treated symbolically provided you are
3563
careful about handling their values in the test data. At the least, you have to set
3564
the test data value for such a feature to its closest value in the training data.
3565
The module does that automatically for you for those numeric features for which the
3566
number of different numeric values is less than a user-specified threshold. For those
3567
numeric features that the module is allowed to treat symbolically, this snapping of
3568
the values of the features in the test data to the small set of values in the training
3569
data is carried out automatically by the module. That is, after a user has told the
3570
module which numeric features to treat symbolically, the user need not worry about
3571
how the feature values appear in the test data.
3572
3573
The constructor parameter C lets you tell
3574
the module when to consider an otherwise numeric feature symbolically. Suppose you
3575
set this parameter to 10; that means that all numeric-looking features that take 10
3576
or fewer different values in the training datafile will be considered to be symbolic
3577
features by the module. See the tutorial at
3578
L for
3579
further information on the implementation issues related to the symbolic and numeric
3580
features.
3581
3582
=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3583
3584
For the purpose of estimating the probabilities, it is necessary to sample the range
3585
of values taken on by a numerical feature. For features with "nice" statistical
3586
properties, this sampling interval is set to the median of the differences between
3587
the successive feature values in the training data. (Obviously, as you would expect,
3588
you first sort all the values for a feature before computing the successive
3589
differences.) This logic will not work for the sort of a feature described below.
3590
3591
Consider a feature whose values are heavy-tailed, and, at the same time, the values
3592
span a million to one range. What I mean by heavy-tailed is that rare values can
3593
occur with significant probabilities. It could happen that most of the values for
3594
such a feature are clustered at one of the two ends of the range. At the same time,
3595
there may exist a significant number of values near the end of the range that is less
3596
populated. (Typically, features related to human economic activities --- such as
3597
wealth, incomes, etc. --- are of this type.) With the logic described in the
3598
previous paragraph, you could end up with a sampling interval that is much too small,
3599
which could result in millions of sampling points for the feature if you are not
3600
careful.
3601
3602
Beginning with Version 2.22, you have two options in dealing with such features. You
3603
can choose to go with the default behavior of the module, which is to sample the
3604
value range for such a feature over a maximum of 500 points. Or, you can supply an
3605
additional option to the constructor that sets a user-defined value for the number of
3606
points to use. The name of the option is C. The following
3607
script
3608
3609
construct_dt_for_heavytailed.pl
3610
3611
in the C directory shows an example of how to call the constructor of the
3612
module with the C option.
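For reference, a hedged sketch of such a constructor call is shown below; the file name, the column indices, and the value 100 chosen for the number of sampling points are all purely illustrative:

    # Illustrative only: cap the sampling of a heavy-tailed numeric feature
    # at 100 points instead of relying on the default behavior.
    my $dt = Algorithm::DecisionTree->new(
                 training_datafile        => "heavytailed_training.csv",
                 csv_class_column_index   => 1,
                 csv_columns_for_features => [2,3],
                 number_of_histogram_bins => 100,
             );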
3613
3614
3615
=head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3616
3617
Versions 2.1 and higher include a new class named C, derived from
3618
the main class C, that runs a 10-fold cross-validation test on your
3619
training data to test its ability to discriminate between the classes mentioned in
3620
the training file.
3621
3622
The 10-fold cross-validation test divides all of the training data into ten parts,
3623
with nine parts used for training a decision tree and one part used for testing its
3624
ability to classify correctly. This selection of nine parts for training and one part
3625
for testing is carried out in all of the ten different possible ways.
3626
3627
The following code fragment illustrates how you invoke the testing function of the
3628
EvalTrainingData class:
3629
3630
my $training_datafile = "training.csv";
3631
my $eval_data = EvalTrainingData->new(
3632
training_datafile => $training_datafile,
3633
csv_class_column_index => 1,
3634
csv_columns_for_features => [2,3],
3635
entropy_threshold => 0.01,
3636
max_depth_desired => 3,
3637
symbolic_to_numeric_cardinality_threshold => 10,
3638
);
3639
$eval_data->get_training_data();
3640
$eval_data->evaluate_training_data()
3641
3642
The last statement above prints out a Confusion Matrix and the value of Training Data
3643
Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3644
The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3645
cross-validation test.
3646
3647
This testing functionality can also be used to find the best values to use for the
3648
constructor parameters C, C, and
3649
C.
3650
3651
The following two scripts in the C directory illustrate the use of the
3652
C class for testing the quality of your data:
3653
3654
evaluate_training_data1.pl
3655
evaluate_training_data2.pl
3656
3657
3658
=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3659
3660
Assuming your training data is good, the quality of the results you get from a
3661
decision tree would depend on the choices you make for the constructor parameters
3662
C, C, and
3663
C. You can optimize your choices for
3664
these parameters by running the 10-fold cross-validation test that is made available
3665
in Versions 2.2 and higher through the new class C that is included
3666
in the module file. A description of how to run this test is in the previous section
3667
of this document.
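As a simple illustration of this idea, the following sketch (it is not one of the module's own example scripts) repeats the evaluation shown in the previous section for a few candidate values of max_depth_desired; the file name and column indices are illustrative, and you would compare the Confusion Matrix and the Training Data Quality Index printed for each run:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;   # the EvalTrainingData class is included in the module file

    my $training_datafile = "training.csv";    # your own CSV training file
    foreach my $depth (2, 3, 4, 5) {
        print "\n==== evaluating with max_depth_desired = $depth ====\n";
        my $eval_data = EvalTrainingData->new(
                            training_datafile        => $training_datafile,
                            csv_class_column_index   => 1,
                            csv_columns_for_features => [2,3],
                            entropy_threshold        => 0.01,
                            max_depth_desired        => $depth,
                            symbolic_to_numeric_cardinality_threshold => 10,
                        );
        $eval_data->get_training_data();
        $eval_data->evaluate_training_data();
    }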
3668
3669
3670
=head1 DECISION TREE INTROSPECTION
3671
3672
Starting with Version 2.30, you can ask the C class of the module to
3673
explain the classification decisions made at the different nodes of the decision
3674
tree.
3675
3676
Perhaps the most important bit of information you are likely to seek through DT
3677
introspection is the list of the training samples that fall directly in the portion
3678
of the feature space that is assigned to a node.
3679
3680
However, note that, when training samples are non-uniformly distributed in the
3681
underlying feature space, it is possible for a node to exist even when there are no
3682
training samples in the portion of the feature space assigned to the node. That is
3683
because the decision tree is constructed from the probability densities estimated
3684
from the training data. When the training samples are non-uniformly distributed, it
3685
is entirely possible for the estimated probability densities to be non-zero in a
3686
small region around a point even when there are no training samples specifically in
3687
that region. (After you have created a statistical model for, say, the height
3688
distribution of people in a community, the model may return a non-zero probability
3689
for the height values in a small interval even if the community does not include a
3690
single individual whose height falls in that interval.)
3691
3692
That a decision-tree node can exist even when there are no training samples in that
3693
portion of the feature space that belongs to the node is an important indication of
3694
the generalization ability of a decision-tree-based classifier.
3695
3696
In light of the explanation provided above, before the DTIntrospection class supplies
3697
any answers at all, it asks you to accept the fact that features can take on non-zero
3698
probabilities at a point in the feature space even though there are zero training
3699
samples at that point (or in a small region around that point). If you do not accept
3700
this rudimentary fact, the introspection class will not yield any answers (since you
3701
are not going to believe the answers anyway).
3702
3703
The point made above implies that the path leading to a node in the decision tree may
3704
test a feature for a certain value or threshold despite the fact that the portion of
3705
the feature space assigned to that node is devoid of any training data.
3706
3707
See the following three scripts in the Examples directory for how to carry out DT
3708
introspection:
3709
3710
introspection_in_a_loop_interactive.pl
3711
3712
introspection_show_training_samples_at_all_nodes_direct_influence.pl
3713
3714
introspection_show_training_samples_to_nodes_influence_propagation.pl
3715
3716
The first script places you in an interactive session in which you will first be
3717
asked for the node number you are interested in. Subsequently, you will be asked
3718
whether or not you are interested in specific questions that the introspection can
3719
provide answers for. The second script descends down the decision tree and shows for
3720
each node the training samples that fall directly in the portion of the feature space
3721
assigned to that node. The third script shows for each training sample how it
3722
affects the decision-tree nodes either directly or indirectly through the
3723
generalization achieved by the probabilistic modeling of the data.
3724
3725
The output of the script
3726
C looks like:
3727
3728
Node 0: the samples are: None
3729
Node 1: the samples are: [sample_46 sample_58]
3730
Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3731
Node 3: the samples are: []
3732
Node 4: the samples are: []
3733
...
3734
...
3735
3736
The nodes for which no samples are listed come into existence through
3737
the generalization achieved by the probabilistic modeling of the data.
3738
3739
The output produced by the script
3740
C looks like
3741
3742
sample_1:
3743
nodes affected directly: [2 5 19 23]
3744
nodes affected through probabilistic generalization:
3745
2=> [3 4 25]
3746
25=> [26]
3747
5=> [6]
3748
6=> [7 13]
3749
7=> [8 11]
3750
8=> [9 10]
3751
11=> [12]
3752
13=> [14 18]
3753
14=> [15 16]
3754
16=> [17]
3755
19=> [20]
3756
20=> [21 22]
3757
23=> [24]
3758
3759
sample_4:
3760
nodes affected directly: [2 5 6 7 11]
3761
nodes affected through probabilistic generalization:
3762
2=> [3 4 25]
3763
25=> [26]
3764
5=> [19]
3765
19=> [20 23]
3766
20=> [21 22]
3767
23=> [24]
3768
6=> [13]
3769
13=> [14 18]
3770
14=> [15 16]
3771
16=> [17]
3772
7=> [8]
3773
8=> [9 10]
3774
11=> [12]
3775
3776
...
3777
...
3778
...
3779
3780
For each training sample, the display shown above first presents the list of nodes
3781
that are directly affected by the sample. A node is affected directly by a sample if
3782
the latter falls in the portion of the feature space that belongs to the former.
3783
Subsequently, for each training sample, the display shows a subtree of the nodes that
3784
are affected indirectly by the sample through the generalization achieved by the
3785
probabilistic modeling of the data. In general, a node is affected indirectly by a
3786
sample if it is a descendant of another node that is affected directly.
3787
3788
Also see the section titled B regarding how to invoke the
3789
introspection capabilities of the module in your own code.
3790
3791
=head1 METHODS
3792
3793
The module provides the following methods for constructing a decision tree from
3794
training data in a disk file and for classifying new data records with the decision
3795
tree thus constructed:
3796
3797
=over 4
3798
3799
=item B
3800
3801
my $dt = Algorithm::DecisionTree->new(
3802
training_datafile => $training_datafile,
3803
csv_class_column_index => 2,
3804
csv_columns_for_features => [3,4,5,6,7,8],
3805
entropy_threshold => 0.01,
3806
max_depth_desired => 8,
3807
symbolic_to_numeric_cardinality_threshold => 10,
3808
);
3809
3810
A call to C constructs a new instance of the C class.
3811
For this call to make sense, the training data in the training datafile must be
3812
in the CSV format.
3813
3814
=back
3815
3816
=head2 Constructor Parameters
3817
3818
=over 8
3819
3820
=item C:
3821
3822
This parameter supplies the name of the file that contains the training data.
3823
3824
=item C:
3825
3826
When using a CSV file for your training data, this parameter supplies the zero-based
3827
column index for the column that contains the class label for each data record in the
3828
training file.
3829
3830
=item C:
3831
3832
When using a CSV file for your training data, this parameter supplies a list of
3833
columns corresponding to the features you wish to use for decision tree construction.
3834
Each column is specified by its zero-based index.
3835
3836
=item C:
3837
3838
This parameter sets the granularity with which the entropies are sampled by the
3839
module. For example, a feature test at a node in the decision tree is acceptable if
3840
the entropy gain achieved by the test exceeds this threshold. The larger the value
3841
you choose for this parameter, the smaller the tree. Its default value is 0.01.
3842
3843
=item C:
3844
3845
This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3846
smaller the value you choose for this parameter, the smaller the tree.
3847
3848
=item C:
3849
3850
This parameter allows the module to treat an otherwise numeric feature symbolically
3851
if the number of different values the feature takes in the training data file does
3852
not exceed the value of this parameter.
3853
3854
=item C:
3855
3856
This parameter gives the user the option to set the number of points at which the
3857
value range for a feature should be sampled for estimating the probabilities. This
3858
parameter is effective only for those features that occupy a large value range and
3859
whose probability distributions are heavy tailed. B
3860
when you have a very large training dataset:> In general, the larger the dataset, the
3861
smaller the smallest difference between any two values for a numeric feature in
3862
relation to the overall range of values for that feature. In such cases, the module
3863
may use too large a number of bins for estimating the probabilities and that may slow
3864
down the calculation of the decision tree. You can get around this difficulty by
3865
explicitly giving a value to the 'C' parameter.
3866
3867
=back
3868
3869
3870
You can choose the best values to use for the last three constructor parameters by
3871
running a 10-fold cross-validation test on your training data through the class
3872
C that comes with Versions 2.1 and higher of this module. See the
3873
section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
3874
3875
=over
3876
3877
=item B
3878
3879
After you have constructed a new instance of the C class,
3880
you must now read in the training data from the file named in the call to the
3881
constructor. This you do by:
3882
3883
$dt->get_training_data();
3884
3885
3886
=item B
3887
3888
If you wish to see the training data that was just digested by the module,
3889
call
3890
3891
$dt->show_training_data();
3892
3893
=item B
3894
3895
=item B
3896
3897
After the module has read the training data file, it needs to initialize the
3898
probability cache. This you do by invoking:
3899
3900
$dt->calculate_first_order_probabilities()
3901
$dt->calculate_class_priors()
3902
3903
=item B
3904
3905
With the probability cache initialized, it is time to construct a decision tree
3906
classifier. This you do by
3907
3908
my $root_node = $dt->construct_decision_tree_classifier();
3909
3910
This call returns an instance of type C. The C class is defined
3911
within the main package file. So don't forget that C<$root_node> in the above
3912
example call will be instantiated to an object of type C.
3913
3914
=item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3915
3916
$root_node->display_decision_tree(" ");
3917
3918
This will display the decision tree in your terminal window by using a recursively
3919
determined offset for each node as the display routine descends down the tree.
3920
3921
I have intentionally left the syntax fragment C<$root_node> in the above call to
3922
remind the reader that C is NOT called on the instance of
3923
the C we constructed earlier, but on the C instance returned by
3924
the call to C.
3925
3926
=item B
3927
3928
Let's say you want to classify the following data record:
3929
3930
my @test_sample = qw / g2=4.2
3931
grade=2.3
3932
gleason=4
3933
eet=1.7
3934
age=55.0
3935
ploidy=diploid /;
3936
3937
you'd make the following call:
3938
3939
my $classification = $dt->classify($root_node, \@test_sample);
3940
3941
where, again, C<$root_node> is an instance of type C returned by the call to
3942
C. The variable C<$classification> holds a
3943
reference to a hash whose keys are the class names and whose values are the associated
3944
probabilities. The hash that is returned by the above call also includes a special
3945
key-value pair for a key named C. The value associated with this key
3946
is an anonymous array that holds the path, in the form of a list of nodes, from the
3947
root node to the leaf node in the decision tree where the final classification was
3948
made.
3949
3950
3951
=item B
3952
3953
This method allows you to use a decision-tree based classifier in an interactive
3954
mode. In this mode, a user is prompted for answers to the questions pertaining to
3955
the feature tests at the nodes of the tree. The syntax for invoking this method is:
3956
3957
my $classification = $dt->classify_by_asking_questions($root_node);
3958
3959
where C<$dt> is an instance of the C class returned by a
3960
call to C and C<$root_node> the root node of the decision tree returned by a
3961
call to C.
3962
3963
=back
3964
3965
3966
=head1 THE INTROSPECTION API
3967
3968
To construct an instance of C, you call
3969
3970
my $introspector = DTIntrospection->new($dt);
3971
3972
where you supply the instance of the C class you used for constructing
3973
the decision tree through the parameter C<$dt>. After you have constructed an
3974
instance of the introspection class, you must initialize it by
3975
3976
$introspector->initialize();
3977
3978
Subsequently, you can invoke either of the following methods:
3979
3980
$introspector->explain_classification_at_one_node($node);
3981
3982
$introspector->explain_classifications_at_multiple_nodes_interactively();
3983
3984
depending on whether you want introspection at a single specified node or inside an
3985
infinite loop for an arbitrary number of nodes.
3986
3987
If you want to output a tabular display that shows for each node in the decision tree
3988
all the training samples that fall in the portion of the feature space that belongs
3989
to that node, call
3990
3991
$introspector->display_training_samples_at_all_nodes_direct_influence_only();
3992
3993
If you want to output a tabular display that shows for each training sample a list of
3994
all the nodes that are affected directly AND indirectly by that sample, call
3995
3996
$introspector->display_training_training_samples_to_nodes_influence_propagation();
3997
3998
A training sample affects a node directly if the sample falls in the portion of the
3999
feature space assigned to that node. On the other hand, a training sample is
4000
considered to affect a node indirectly if the node is a descendant of a node that is
4001
affected directly by the sample.
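Putting these calls together, a minimal sketch of an introspection session looks as follows. It assumes that $dt is the DecisionTree instance on which you have already called construct_decision_tree_classifier(), and the node number 1 passed to the first call is just an arbitrary example:

    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();

    # Explanation for one specific node (node 1 chosen arbitrarily):
    $introspector->explain_classification_at_one_node(1);

    # Tabular display of the training samples that fall directly in the
    # portion of the feature space assigned to each node:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();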
4002
4003
4004
=head1 BULK CLASSIFICATION OF DATA RECORDS
4005
4006
For large test datasets, you would obviously want to process an entire file of test
4007
data at a time. The following scripts in the C directory illustrate how you
4008
can do that:
4009
4010
classify_test_data_in_a_file.pl
4011
4012
This script requires three command-line arguments: the first names the
4013
training datafile, the second the test datafile, and the third the file in which the
4014
classification results are to be deposited.
4015
4016
The other examples directories, C, C, and
4017
C, also contain scripts that illustrate how to carry out
4018
bulk classification of data records when you wish to take advantage of bagging,
4019
boosting, or tree randomization. In their respective directories, these scripts are
4020
named:
4021
4022
bagging_for_bulk_classification.pl
4023
boosting_for_bulk_classification.pl
4024
classify_database_records.pl
4025
4026
4027
=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4028
4029
It depends on whether you apply the classifier at once to all the data samples in a
4030
file, or whether you feed one data sample at a time into the classifier.
4031
4032
In general, the classifier returns soft classification for a test data vector. What
4033
that means is that, in general, the classifier will list all the classes to which a
4034
given data vector could belong and the probability of each such class label for the
4035
data vector. Run the example scripts in the Examples directory to see how the output
4036
of classification can be displayed.
4037
4038
With regard to the soft classifications returned by this classifier, if the
4039
probability distributions for the different classes overlap in the underlying feature
4040
space, you would want the classifier to return all of the applicable class labels for
4041
a data vector along with the corresponding class probabilities. Another reason
4042
why the decision tree classifier may associate significant probabilities with
4043
multiple class labels is that you used an inadequate number of training samples to
4044
induce the decision tree. The good thing is that the classifier does not lie to you
4045
(unlike, say, a hard classification rule that would return a single class label
4046
corresponding to the partitioning of the underlying feature space). The decision
4047
tree classifier gives you the best classification that can be made given the training
4048
data you fed into it.
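If, after examining the soft classification, you still want a single hard label, nothing prevents you from simply picking the class with the largest probability in the hash reference returned by classify(). A small sketch, assuming $classification was obtained as shown in the METHODS section of this document:

    # Keep only the entries whose values are plain probabilities (the hash
    # also contains a special entry whose value holds the solution path):
    my @labels = grep { !ref $classification->{$_} } keys %$classification;

    # Sort by descending probability and report the top label:
    my ($best) = sort { $classification->{$b} <=> $classification->{$a} } @labels;
    print "most probable class: $best ($classification->{$best})\n";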
4049
4050
4051
=head1 USING BAGGING
4052
4053
Starting with Version 3.0, you can use the class C that
4054
comes with the module to incorporate bagging in your decision tree based
4055
classification. Bagging means constructing multiple decision trees for different
4056
(possibly overlapping) segments of the data extracted from your training dataset and
4057
then aggregating the decisions made by the individual decision trees for the final
4058
classification. The aggregation of the classification decisions can average out the
4059
noise and bias that may otherwise affect the classification decision obtained from
4060
just one tree.
4061
4062
=over 4
4063
4064
=item B
4065
4066
A typical call to the constructor for the C class looks
4067
like:
4068
4069
use Algorithm::DecisionTreeWithBagging;
4070
4071
my $training_datafile = "stage3cancer.csv";
4072
4073
my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4074
training_datafile => $training_datafile,
4075
csv_class_column_index => 2,
4076
csv_columns_for_features => [3,4,5,6,7,8],
4077
entropy_threshold => 0.01,
4078
max_depth_desired => 8,
4079
symbolic_to_numeric_cardinality_threshold => 10,
4080
how_many_bags => 4,
4081
bag_overlap_fraction => 0.2,
4082
);
4083
4084
Note in particular the following two constructor parameters:
4085
4086
how_many_bags
4087
4088
bag_overlap_fraction
4089
4090
where, as the name implies, the parameter C controls how many bags
4091
(and, therefore, how many decision trees) will be constructed from your training
4092
dataset; and where the parameter C controls the degree of
4093
overlap between the bags. To understand what exactly is achieved by setting the
4094
parameter C to 0.2 in the above example, let's say that the
4095
non-overlapping partitioning of the training data between the bags results in 100
4096
training samples per bag. With bag_overlap_fraction set to 0.2, an additional 20 samples
4097
drawn randomly from the other bags will be added to the data in each bag.
4098
4099
=back
4100
4101
=head2 B class>
4102
4103
=over 8
4104
4105
=item B
4106
4107
This method reads your training datafile, randomizes it, and then partitions it into
4108
the specified number of bags. Subsequently, if the constructor parameter
4109
C is non-zero, it adds to each bag additional samples drawn at
4110
random from the other bags. The number of these additional samples added to each bag
4111
is controlled by the constructor parameter C. If this
4112
parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4113
drawn from the other bags.
4114
4115
=item B
4116
4117
Shows for each bag the names of the training data samples in that bag.
4118
4119
=item B
4120
4121
Calls on the appropriate methods of the main C class to estimate the
4122
first-order probabilities from the data samples in each bag.
4123
4124
=item B
4125
4126
Calls on the appropriate method of the main C class to estimate the
4127
class priors for the data samples in each bag.
4128
4129
=item B
4130
4131
Calls on the appropriate method of the main C class to construct a
4132
decision tree from the training data in each bag.
4133
4134
=item B
4135
4136
Displays separately the decision tree for each bag.
4137
4138
=item B
4139
4140
Calls on the appropriate methods of the main C class to classify the
4141
argument test sample.
4142
4143
=item B
4144
4145
Displays separately the classification decision made by the decision tree
4146
constructed for each bag.
4147
4148
=item B
4149
4150
Using majority voting, this method aggregates the classification decisions made by
4151
the individual decision trees into a single decision.
4152
4153
=back
4154
4155
See the example scripts in the directory C for how to call these
4156
methods for classifying individual samples and for bulk classification when you place
4157
all your test samples in a single file.
4158
4159
=head1 USING BOOSTING
4160
4161
Starting with Version 3.20, you can use the class C for
4162
constructing a boosted decision-tree classifier. Boosting results in a cascade of
4163
decision trees in which each decision tree is constructed with samples that are
4164
mostly those that are misclassified by the previous decision tree. To be precise,
4165
you create a probability distribution over the training samples for the selection of
4166
samples for training each decision tree in the cascade. To start out, the
4167
distribution is uniform over all of the samples. Subsequently, this probability
4168
distribution changes according to the misclassifications by each tree in the cascade:
4169
if a sample is misclassified by a given tree in the cascade, the probability of its
4170
being selected for training the next tree is increased significantly. You also
4171
associate a trust factor with each decision tree depending on its power to classify
4172
correctly all of the training data samples. After a cascade of decision trees is
4173
constructed in this manner, you construct a final classifier that calculates the
4174
class label for a test data sample by taking into account the classification
4175
decisions made by each individual tree in the cascade, the decisions being weighted
4176
by the trust factors associated with the individual classifiers. These boosting
4177
notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4178
celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4179
Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4180
the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4181
tutorial introduction to AdaBoost, see L
4182
4183
Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4184
boosting apply only to the case of binary classification. Additionally, your
4185
training dataset must capture all of the significant statistical variations in the
4186
classes represented therein.
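For readers who want to see the bookkeeping behind the phrases "probability distribution over the training samples" and "trust factor", the standard AdaBoost formulas from the Freund-Schapire paper are reproduced below; this is background material, not code specific to this module. With e_t denoting the weighted misclassification rate of the stage-t tree h_t, and y_i the +1/-1 label of training sample x_i:

    \alpha_t   = \tfrac{1}{2} \ln \frac{1 - e_t}{e_t}                      % trust factor for stage t
    D_{t+1}(i) \propto D_t(i) \, e^{-\alpha_t \, y_i \, h_t(x_i)}          % updated sampling distribution
    H(x)       = \mathrm{sign}\Big( \sum_t \alpha_t \, h_t(x) \Big)        % final (binary) classifier

A misclassified sample (for which y_i h_t(x_i) = -1) thus has its selection probability increased for the next stage, and trees with lower error rates receive larger trust factors in the final vote.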
4187
4188
=over 4
4189
4190
=item B
4191
4192
If you'd like to experiment with boosting, a typical call to the constructor for the
4193
C class looks like:
4194
4195
use Algorithm::BoostedDecisionTree;
4196
my $training_datafile = "training6.csv";
4197
my $boosted = Algorithm::BoostedDecisionTree->new(
4198
training_datafile => $training_datafile,
4199
csv_class_column_index => 1,
4200
csv_columns_for_features => [2,3],
4201
entropy_threshold => 0.01,
4202
max_depth_desired => 8,
4203
symbolic_to_numeric_cardinality_threshold => 10,
4204
how_many_stages => 4,
4205
);
4206
4207
Note in particular the constructor parameter:
4208
4209
how_many_stages
4210
4211
As its name implies, this parameter controls how many stages will be used in the
4212
boosted decision tree classifier. As mentioned above, a separate decision tree is
4213
constructed for each stage of boosting using a set of training samples that are drawn
4214
through a probability distribution maintained over the entire training dataset.
4215
4216
=back
4217
4218
=head2 B class>
4219
4220
=over 8
4221
4222
=item B
4223
4224
This method reads your training datafile, creates the data structures from the data
4225
ingested for constructing the base decision tree.
4226
4227
=item B
4228
4229
Writes to the standard output the training data samples and also some relevant
4230
properties of the features used in the training dataset.
4231
4232
=item B
4233
4234
Calls on the appropriate methods of the main C class to estimate the
4235
first-order probabilities and the class priors.
4236
4237
=item B
4238
4239
Calls on the appropriate method of the main C class to construct the
4240
base decision tree.
4241
4242
=item B
4243
4244
Displays the base decision tree in your terminal window. (The textual form of the
4245
decision tree is written out to the standard output.)
4246
4247
=item B
4248
4249
Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4250
earlier, the training samples for each tree in the cascade are drawn using a
4251
probability distribution over the entire training dataset. This probability
4252
distribution for any given tree in the cascade is heavily influenced by which
4253
training samples are misclassified by the previous tree.
4254
4255
=item B
4256
4257
Displays separately in your terminal window the decision tree constructed for each
4258
stage of the cascade. (The textual form of the trees is written out to the standard
4259
output.)
4260
4261
=item B
4262
4263
Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4264
4265
=item B
4266
4267
You can call this method to display in your terminal window the classification
4268
decision made by each decision tree in the cascade. The method also prints out the
4269
trust factor associated with each decision tree. It is important to look
4270
simultaneously at the classification decision and the trust factor for each tree ---
4271
since a classification decision made by a specific tree may appear bizarre for a
4272
given test sample. This method is useful primarily for debugging purposes.
4273
4274
=item B
4275
4276
As with the previous method, this method is useful mostly for debugging. It returns
4277
class labels for the samples misclassified by the stage whose integer index is
4278
supplied as an argument to the method. Say you have 10 stages in your cascade. The
4279
value of the argument C would go from 0 to 9, with 0 corresponding to
4280
the base tree.
4281
4282
=item B
4283
4284
Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4285
classification decisions made by the individual trees while taking into account the
4286
trust factors associated with the trees. As mentioned earlier, we associate with
4287
each tree of the cascade a trust factor that depends on the overall misclassification
4288
rate associated with that tree.
4289
4290
=back
4291
4292
See the example scripts in the C subdirectory for how to call the
4293
methods listed above for classifying individual data samples with boosting and for
4294
bulk classification when you place all your test samples in a single file.
4295
4296
4297
=head1 USING RANDOMIZED DECISION TREES
4298
4299
As mentioned earlier, the new C class allows you to solve
4300
the following two problems: (1) Data classification using the needle-in-a-haystack
4301
metaphor, that is, when a vast majority of your training samples belong to just one
4302
class. And (2) You have access to a very large database of training samples and you
4303
wish to construct an ensemble of decision trees for classification.
4304
4305
=over 4
4306
4307
=item B
4308
4309
Here is how you'd call the C constructor for
4310
needle-in-a-haystack classification:
4311
4312
use Algorithm::RandomizedTreesForBigData;
4313
my $training_datafile = "your_database.csv";
4314
my $rt = Algorithm::RandomizedTreesForBigData->new(
4315
training_datafile => $training_datafile,
4316
csv_class_column_index => 48,
4317
csv_columns_for_features => [24,32,33,34,41],
4318
entropy_threshold => 0.01,
4319
max_depth_desired => 8,
4320
symbolic_to_numeric_cardinality_threshold => 10,
4321
how_many_trees => 5,
4322
looking_for_needles_in_haystack => 1,
4323
);
4324
4325
Note in particular the constructor parameters:
4326
4327
looking_for_needles_in_haystack
4328
how_many_trees
4329
4330
The first of these parameters, C, invokes the logic for
4331
constructing an ensemble of decision trees, each based on a training dataset that
4332
uses all of the minority class samples, and a random drawing from the majority class
4333
samples.
4334
4335
Here is how you'd call the C constructor for a more
4336
general attempt at constructing an ensemble of decision trees, with each tree trained
4337
with randomly drawn samples from a large database of training data (without paying
4338
attention to the differences in the sizes of the populations for the different
4339
classes):
4340
4341
use Algorithm::RandomizedTreesForBigData;
4342
my $training_datafile = "your_database.csv";
4343
my $rt = Algorithm::RandomizedTreesForBigData->new(
4344
training_datafile => $training_datafile,
4345
csv_class_column_index => 2,
4346
csv_columns_for_features => [3,4,5,6,7,8],
4347
entropy_threshold => 0.01,
4348
max_depth_desired => 8,
4349
symbolic_to_numeric_cardinality_threshold => 10,
4350
how_many_trees => 3,
4351
how_many_training_samples_per_tree => 50,
4352
);
4353
4354
Note in particular the constructor parameters:
4355
4356
how_many_training_samples_per_tree
4357
how_many_trees
4358
4359
When you set the C parameter, you are not allowed
4360
to also set the C parameter, and vice versa.
4361
4362
=back
4363
4364
=head2 B class>
4365
4366
=over 8
4367
4368
=item B
4369
4370
What this method does depends on which of the two constructor parameters ---
4371
C or C --- is
4372
set. When the former is set, it creates a collection of training datasets for
4373
C number of decision trees, with each dataset being a mixture of the
4374
minority class and samples drawn randomly from the majority class. However, when the
4375
latter option is set, all the datasets are drawn randomly from the training database
4376
with no particular attention given to the relative populations of the two classes.
4377
4378
=item B
4379
4380
As the name implies, this method shows the training data being used for all the
4381
decision trees. This method is useful for debugging purposes using small datasets.
4382
4383
=item B
4384
4385
Calls on the appropriate method of the main C to estimate the
4386
first-order probabilities for the training dataset to be used for each decision tree.
4387
4388
=item B
4389
4390
Calls on the appropriate method of the main C class to estimate the
4391
class priors for the training dataset to be used for each decision tree.
4392
4393
=item B
4394
4395
Calls on the appropriate method of the main C class to construct the
4396
decision trees.
4397
4398
=item B
4399
4400
Displays all the decision trees in your terminal window. (The textual form of the
4401
decision trees is written out to the standard output.)
4402
4403
=item B
4404
4405
The test_sample is sent to each decision tree for classification.
4406
4407
=item B
4408
4409
The classification decisions returned by the individual decision trees are written
4410
out to the standard output.
4411
4412
=item B
4413
4414
This method aggregates the classification results returned by the individual decision
4415
trees and returns the majority decision.
4416
4417
=back
4418
4419
=head1 CONSTRUCTING REGRESSION TREES:
4420
4421
Decision tree based modeling requires that the class labels be distinct. That is,
4422
the training dataset must contain a relatively small number of discrete class labels
4423
for all of your data records if you want to model the data with one or more decision
4424
trees. However, when one is trying to understand all of the associational
4425
relationships that exist in a large database, one often runs into situations where,
4426
instead of discrete class labels, you have a continuously valued variable as a
4427
dependent variable whose values are predicated on a set of feature values. It is for
4428
such situations that you will find useful the new class C that is now
4429
a part of the C module. The C class has been
4430
programmed as a subclass of the main C class.
4431
4432
You can think of regression with a regression tree as a powerful generalization of
4433
the very commonly used Linear Regression algorithms. Although you can certainly
4434
carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4435
modeling nonlinearities between the predictor variables and the dependent variable,
4436
specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4437
can inject continuities between the predictor and the predicted variables that may
4438
not really exist in the real data. Regression trees, on the other hand, give you a
4439
piecewise linear relationship between the predictor and the predicted variables that
4440
is freed from the constraints of superimposed continuities at the joins between the
4441
different segments. See the following tutorial for further information regarding the
4442
standard linear regression approach and the regression that can be achieved with the
4443
RegressionTree class in this module:
4444
L
4445
4446
The RegressionTree class in the current version of the module assumes that all of
4447
your data is numerical. That is, unlike what is possible with the DecisionTree class
4448
(and the other more closely related classes in this module) that allow your training
4449
file to contain a mixture of numerical and symbolic data, the RegressionTree class
4450
requires that ALL of your data be numerical. I hope to relax this constraint in
4451
future versions of this module. Obviously, the dependent variable will always be
4452
numerical for regression.
4453
4454
See the example scripts in the directory C if you wish to become
4455
more familiar with the regression capabilities of the module.
4456
4457
=over 4
4458
4459
=item B
4460
4461
my $training_datafile = "gendata5.csv";
4462
my $rt = Algorithm::RegressionTree->new(
4463
training_datafile => $training_datafile,
4464
dependent_variable_column => 2,
4465
predictor_columns => [1],
4466
mse_threshold => 0.01,
4467
max_depth_desired => 2,
4468
jacobian_choice => 0,
4469
);
4470
4471
Note in particular the constructor parameters:
4472
4473
dependent_variable
4474
predictor_columns
4475
mse_threshold
4476
jacobian_choice
4477
4478
The first of these parameters, C, is set to the column index in
4479
the CSV file for the dependent variable. The second constructor parameter,
4480
C, tells the system as to which columns contain values for the
4481
predictor variables. The third parameter, C, is for deciding when to
4482
partition the data at a node into two child nodes as a regression tree is being
4483
constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4484
partitioning any of the features at a node is smaller than C, that
4485
node becomes a leaf node of the regression tree.
4486
4487
The last parameter, C, must be set to either 0 or 1 or 2. Its
4488
default value is 0. When this parameter equals 0, the regression coefficients are
4489
calculated using the linear least-squares method and no further "refinement" of the
4490
coefficients is carried out using gradient descent. This is the fastest way to
4491
calculate the regression coefficients. When C is set to 1, you get
4492
a weak version of gradient descent in which the Jacobian is set to the "design
4493
matrix" itself. Choosing 2 for C results in a more reasonable
4494
approximation to the Jacobian. That, however, is at a cost of much longer
4495
computation time. B For most cases, using 0 for C is the
4496
best choice. See my tutorial "I" for why
4497
that is the case.
4498
4499
=back
4500
4501
=head2 B class>
4502
4503
=over 8
4504
4505
=item B
4506
4507
Only CSV training datafiles are allowed. Additionally, the first record in the file
4508
must list the names of the fields, and the first column must contain an integer ID
4509
for each record.
4510
4511
=item B
4512
4513
As the name implies, this is the method that constructs a regression tree.
4514
4515
=item B
4516
4517
Displays the regression tree, as the name implies. The white-space string argument
4518
specifies the offset to use in displaying the child nodes in relation to a parent
4519
node.
4520
4521
=item B
4522
4523
You call this method after you have constructed a regression tree if you want to
4524
calculate the prediction for one sample. The parameter C<$root_node> is what is
4525
returned by the call C. The formatting of the argument
4526
bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4527
using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4528
this case, the C<$test_sample> parameter will be bound to a list that will look like
4529
4530
['xvar1 = 23.4', 'xvar2 = 12.9']
4531
4532
An arbitrary amount of white space, including none, on the two sides of the equality
4533
symbol is allowed in the construct shown above. A call to this method returns a
4534
dictionary with two key-value pairs. One of the keys is called C and
4535
the other C. The value associated with key C is the path
4536
in the regression tree to the leaf node that yielded the prediction. And the value
4537
associated with the key C is the answer you are looking for.
4538
4539
=item B
4540
4541
This call calculates the predictions for all of the predictor-variable data in your
4542
training file. The parameter C<$root_node> is what is returned by the call to
4543
C. The values for the dependent variable thus predicted
4544
can be seen by calling C, which is the method mentioned below.
4545
4546
=item B
4547
4548
This method displays the results obtained by calling the prediction method of the
4549
previous entry. This method also creates a hardcopy of the plots and saves it as a
4550
C<.png> disk file. The name of this output file is always C.
4551
4552
=item B
4553
4554
This method carries out an error analysis of the predictions for the samples in your
4555
training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4556
regression, the MSE for the data samples at each of the leaf nodes of the regression
4557
tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4558
The parameter C<$root_node> in the call syntax is what is returned by the call to
4559
C.
4560
4561
=item B
4562
4563
Call this method if you want to apply the regression tree to all your test data in a
4564
disk file. The predictions for all of the test samples in the disk file are written
4565
out to another file whose name is the same as that of the test file except for the
4566
addition of C<_output> in the name of the file. The parameter C<$filename> is the
4567
name of the disk file that contains the test data. And the parameter C<$columns> is a
4568
list of the column indices for the predictor variables in the test file.
4569
4570
=back
4571
4572
=head1 GENERATING SYNTHETIC TRAINING DATA
4573
4574
The module file contains the following additional classes: (1)
4575
C, and (2) C for
4576
generating synthetic training data.
4577
4578
The class C outputs one CSV file for the
4579
training data and another one for the test data for experimenting with numeric
4580
features. The numeric values are generated using a multivariate Gaussian
4581
distribution whose mean and covariance are specified in a parameter file. See the
4582
file C in the C directory for an example of such a
4583
parameter file. Note that the dimensionality of the data is inferred from the
4584
information you place in the parameter file.
4585
4586
The class C generates synthetic training data for the
4587
purely symbolic case. The relative frequencies of the different possible values for
4588
the features are controlled by the biasing information you place in a parameter file.
4589
See C for an example of such a file.
4590
4591
4592
=head1 THE C DIRECTORY
4593
4594
See the C directory in the distribution for how to construct a decision
4595
tree, and how to then classify new data using the decision tree. To become more
4596
familiar with the module, run the scripts
4597
4598
construct_dt_and_classify_one_sample_case1.pl
4599
construct_dt_and_classify_one_sample_case2.pl
4600
construct_dt_and_classify_one_sample_case3.pl
4601
construct_dt_and_classify_one_sample_case4.pl
4602
4603
The first script is for the purely symbolic case, the second for the case that
4604
involves both numeric and symbolic features, the third for the case of purely numeric
4605
features, and the last for the case when the training data is synthetically generated
4606
by the script C.
4607
4608
Next run the following script as it is for bulk classification of data records placed
4609
in a CSV file:
4610
4611
classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4612
4613
The script first constructs a decision tree using the training data in the training
4614
file supplied by the first argument file C. The script then
4615
calculates the class label for each data record in the test data file supplied
4616
through the second argument file, C. The estimated class labels are
4617
written out to the output file which in the call shown above is C. An
4618
important thing to note here is that your test file --- in this case C ---
4619
must have a column for class labels. Obviously, in real-life situations, there will
4620
be no class labels in this column. When that is the case, you can place an empty
4621
string C<""> there for each data record. This is demonstrated by the following call:
4622
4623
classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4624
4625
The following script in the C directory
4626
4627
classify_by_asking_questions.pl
4628
4629
shows how you can use a decision-tree classifier interactively. In this mode, you
4630
first construct the decision tree from the training data and then the user is
4631
prompted for answers to the feature tests at the nodes of the tree.
4632
4633
If your training data has a feature whose values span a large range and, at the same
4634
time, are characterized by a heavy-tail distribution, you should look at the script
4635
4636
construct_dt_for_heavytailed.pl
4637
4638
to see how to use the option C in the call to the
4639
constructor. This option was introduced in Version 2.22 for dealing with such
4640
features. If you do not set this option, the module will use the default value of
4641
500 for the number of points at which to sample the value range for such a feature.
4642
4643
The C directory also contains the following scripts:
4644
4645
generate_training_data_numeric.pl
4646
generate_training_data_symbolic.pl
4647
4648
that show how you can use the module to generate synthetic training data. Synthetic
4649
training data is generated according to the specifications laid out in a parameter file.
4650
There are constraints on how the information is laid out in a parameter file. See
4651
the files C and C in the C directory
4652
for how to structure these files.
4653
4654
The C directory of Versions 2.1 and higher of the module also contains the
4655
following two scripts:
4656
4657
evaluate_training_data1.pl
4658
evaluate_training_data2.pl
4659
4660
that illustrate how the Perl class C can be used to evaluate the
4661
quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4662
class is a subclass of the C class in the module file. See the README
4663
in the C directory for further information regarding these two scripts.
4664
4665
The C directory of Versions 2.31 and higher of the module contains the
4666
following three scripts:
4667
4668
introspection_in_a_loop_interactive.pl
4669
4670
introspection_show_training_samples_at_all_nodes_direct_influence.pl
4671
4672
introspection_show_training_samples_to_nodes_influence_propagation.pl
4673
4674
The first script illustrates how to use the C class of the module
4675
interactively for generating explanations for the classification decisions made at
4676
the nodes of the decision tree. In the interactive session you are first asked for
4677
the node number you are interested in. Subsequently, you are asked whether or
4678
not you are interested in specific questions that the introspector can provide
4679
answers for. The second script generates a tabular display that shows for each node
4680
of the decision tree a list of the training samples that fall directly in the portion
4681
of the feature space assigned to that node. (As mentioned elsewhere in this
4682
documentation, when this list is empty for a node, that means the node is a result of
4683
the generalization achieved by probabilistic modeling of the data. Note that this
4684
module constructs a decision tree NOT by partitioning the set of training samples,
4685
BUT by partitioning the domains of the probability density functions.) The third
4686
script listed above also generates a tabular display, but one that shows how the
4687
influence of each training sample propagates in the tree. This display first shows
4688
the list of nodes that are affected directly by the data in a training sample. This
4689
list is followed by an indented display of the nodes that are affected indirectly by
4690
the training sample. A training sample affects a node indirectly if the node is a
4691
descendant of one of the nodes affected directly.
4692
4693
4694
=head1 THE C<ExamplesBagging> DIRECTORY

The C<ExamplesBagging> directory contains the following scripts:

    bagging_for_classifying_one_test_sample.pl

    bagging_for_bulk_classification.pl

As the names of the scripts imply, the first shows how to call the different methods
of the C<DecisionTreeWithBagging> class for classifying a single test sample. When
you are classifying a single test sample, you can also see how each bag is
classifying the test sample. You can, for example, display the training data used in
each bag, the decision tree constructed for each bag, etc.

The second script is for the case when you place all of the test samples in a single
file. The demonstration script displays for each test sample a single aggregate
classification decision that is obtained through majority voting by all the decision
trees.

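The sketch below indicates roughly what such a calling sequence looks like. The
package name, the bag-related constructor options, and the method names are
assumptions modeled on the two bagging scripts; consult those scripts for the
authoritative usage:

    use strict;
    use warnings;
    use Algorithm::DecisionTreeWithBagging;   # assumed package name

    my $bagged_dt = Algorithm::DecisionTreeWithBagging->new(
                        training_datafile        => 'my_training_data.csv',  # placeholder
                        csv_class_column_index   => 1,
                        csv_columns_for_features => [2, 3, 4, 5],
                        entropy_threshold        => 0.01,
                        max_depth_desired        => 8,
                        how_many_bags            => 4,      # assumed option name
                        bag_overlap_fraction     => 0.20,   # assumed option name
                    );
    $bagged_dt->get_training_data_for_bagging();            # assumed method names below
    $bagged_dt->calculate_first_order_probabilities();
    $bagged_dt->calculate_class_priors();
    $bagged_dt->construct_decision_trees_for_bags();

    my @test_sample = qw/ feature1=3.7 feature2=blue /;     # placeholder feature=value strings
    $bagged_dt->classify_with_bagging(\@test_sample);
    $bagged_dt->get_majority_vote_classification();
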
=head1 THE C<ExamplesBoosting> DIRECTORY

The C<ExamplesBoosting> subdirectory in the main installation directory contains the
following three scripts:

    boosting_for_classifying_one_test_sample_1.pl

    boosting_for_classifying_one_test_sample_2.pl

    boosting_for_bulk_classification.pl

As the names of the first two scripts imply, these show how to call the different
methods of the C<BoostedDecisionTree> class for classifying a single test sample.
When you are classifying a single test sample, you can see how each stage of the
cascade of decision trees is classifying the test sample. You can also view each
decision tree separately and also see the trust factor associated with the tree.

The third script is for the case when you place all of the test samples in a single
file. The demonstration script outputs for each test sample a single aggregate
classification decision that is obtained through trust-factor weighted majority
voting by all the decision trees.

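A rough sketch of the boosting workflow follows. The package name, the stage-count
option, and the method names are assumptions patterned on the three boosting
scripts, which are the authoritative reference:

    use strict;
    use warnings;
    use Algorithm::BoostedDecisionTree;    # assumed package name

    my $booster = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => 'my_training_data.csv',   # placeholder
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2, 3, 4, 5],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      how_many_stages          => 4,     # assumed option name
                  );
    $booster->get_training_data_for_base_tree();         # assumed method names below
    $booster->construct_base_decision_tree();
    $booster->construct_cascade_of_trees();

    my @test_sample = qw/ feature1=3.7 feature2=blue /;  # placeholder feature=value strings
    $booster->classify_with_boosting(\@test_sample);
    $booster->trust_weighted_majority_vote_classifier();
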
=head1 THE C<ExamplesRandomizedTrees> DIRECTORY

The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
become more familiar with the C<RandomizedTreesForBigData> class for solving
needle-in-a-haystack and big-data classification problems. These scripts are:

    randomized_trees_for_classifying_one_test_sample_1.pl

    randomized_trees_for_classifying_one_test_sample_2.pl

    classify_database_records.pl

The first script shows the constructor options to use for solving a
needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
training data belongs to just one class. The second script shows the constructor
options for using randomized decision trees for the case when you have access to a
very large database of training samples and you'd like to construct an ensemble of
decision trees using training samples pulled randomly from the training database.
The last script illustrates how you can evaluate the classification power of an
ensemble of decision trees as constructed by C<RandomizedTreesForBigData> by
classifying a large number of test samples extracted randomly from the training
database.

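Roughly, the calling sequence for the randomized-trees classifier looks like the
sketch below. The package name, the tree-related constructor options, and the method
names are assumptions modeled on the scripts listed above; treat those scripts as
the authoritative reference:

    use strict;
    use warnings;
    use Algorithm::RandomizedTreesForBigData;    # assumed package name

    my $rt_ensemble = Algorithm::RandomizedTreesForBigData->new(
                          training_datafile        => 'my_training_data.csv',  # placeholder
                          csv_class_column_index   => 1,
                          csv_columns_for_features => [2, 3, 4, 5],
                          entropy_threshold        => 0.01,
                          max_depth_desired        => 8,
                          how_many_trees           => 5,           # assumed option name
                          looking_for_needles_in_haystack => 1,    # assumed option name
                      );
    $rt_ensemble->get_training_data_for_N_trees();                 # assumed method names below
    $rt_ensemble->construct_all_decision_trees();

    my @test_sample = qw/ feature1=3.7 feature2=blue /;            # placeholder feature=value strings
    $rt_ensemble->classify_with_all_trees(\@test_sample);
    $rt_ensemble->display_classification_results_for_all_trees();
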
=head1 THE C<ExamplesRegression> DIRECTORY

The C<ExamplesRegression> subdirectory in the main installation directory shows
example scripts that you can use to become familiar with regression trees and how
they can be used for nonlinear regression. If you are new to the concept of
regression trees, start by executing the following scripts without changing them and
see what sort of output is produced by them:

    regression4.pl

    regression5.pl

    regression6.pl

    regression8.pl

The C<regression4.pl> script involves only one predictor variable and one dependent
variable. The training data for this exercise is drawn from a data file that
contains strongly nonlinear data. When you run the script C<regression4.pl>, you
will see how much better the result from tree regression is compared to what you can
get with linear regression.

The C<regression5.pl> script is essentially the same as the previous script except
for the fact that the training datafile used in this case consists of three noisy
segments, as opposed to just two in the previous case.

The script C<regression6.pl> deals with the case when we have two predictor
variables and one dependent variable. You can think of the data as consisting of
noisy height values over an C<(x1,x2)> plane. The data used in this script is drawn
from a CSV file.

Finally, the script C<regression8.pl> shows how you can carry out bulk prediction
for all your test data records in a disk file. The script writes all the calculated
predictions into another disk file whose name is derived from the name of the test
data file.

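A bare-bones sketch of regression-tree usage is shown below. The package name,
constructor options, and method names are assumptions based on the regression
scripts listed above; those scripts are the authoritative reference for the exact
calling sequence:

    use strict;
    use warnings;
    use Algorithm::RegressionTree;    # assumed package name

    # One predictor variable (column 1) and one dependent variable (column 2),
    # as in the single-predictor regression examples; the file name is a placeholder.
    my $rt = Algorithm::RegressionTree->new(
                 training_datafile         => 'my_regression_data.csv',
                 dependent_variable_column => 2,     # assumed option name
                 predictor_columns         => [1],   # assumed option name
                 max_depth_desired         => 2,
             );
    $rt->get_training_data_for_regression();         # assumed method names below
    my $root_node = $rt->construct_regression_tree();
    $rt->prediction_for_single_data_point($root_node, ['x=40']);
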
=head1 EXPORT

None by design.

=head1 BUGS

Please notify the author if you encounter any bugs. When sending email, please place
the string 'DecisionTree' in the subject line.

=head1 INSTALLATION

Download the archive from CPAN in any directory of your choice. Unpack the archive
with a command that on a Linux machine would look like:

    tar zxvf Algorithm-DecisionTree-3.41.tar.gz

This will create an installation directory for you whose name will be
C<Algorithm-DecisionTree-3.41>. Enter this directory and execute the following
commands for a standard install of the module if you have root privileges:

    perl Makefile.PL
    make
    make test
    sudo make install

If you do not have root privileges, you can carry out a non-standard install of the
module in any directory of your choice by:

    perl Makefile.PL prefix=/some/other/directory/
    make
    make test
    make install

With a non-standard install, you may also have to set your PERL5LIB environment
variable so that this module can find the required other modules. How you do that
would depend on what platform you are working on. In order to install this module in
a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
variable by

    setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

If I used bash, I'd need to declare:

    export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/

=head1 THANKS

I wish to thank many users of this module for their feedback. Many of the
improvements I have made to the module over the years are a result of the feedback
received.

I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
used for testing the module. Slaven's feedback in the form of the Bug report
C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
the required version of Perl to 5.10.

On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version
2.27, I am removing the Perl version restriction altogether from Version 2.30.
Thanks Slaven!

=head1 AUTHOR

The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
with the publication of the book I<Designing with Objects> by John-Wiley. If
interested, visit his web page at Purdue to find out what this project was all
about. You might like I<Designing with Objects> especially if you enjoyed reading
Harry Potter as a kid (or even as an adult, for that matter).

If you send email regarding this module, please place the string "DecisionTree" in
your subject line to get past my spam filter. Avi Kak's email address is
C<kak@purdue.edu>

=head1 COPYRIGHT

This library is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.

Copyright 2016 Avinash Kak

=cut