File Coverage

blib/lib/Algorithm/DecisionTree.pm

Criterion	Covered	Total	%
statement	774	2249	34.4
branch	193	708	27.2
condition	41	234	17.5
subroutine	44	92	47.8
pod	8	40	20.0
total	1060	3323	31.9

line	stmt	bran	cond	sub	pod	time	code
1							package Algorithm::DecisionTree;
2
3							#--------------------------------------------------------------------------------------
4							# Copyright (c) 2017 Avinash Kak. All rights reserved. This program is free
5							# software. You may modify and/or distribute it under the same terms as Perl itself.
6							# This copyright notice must remain attached to the file.
7							#
8							# Algorithm::DecisionTree is a Perl module for decision-tree based classification of
9							# multidimensional data.
10							# -------------------------------------------------------------------------------------
11
12							#use 5.10.0;
13	1			1		22561	use strict;
	1					4
	1					46
14	1			1		9	use warnings;
	1					3
	1					51
15	1			1		7	use Carp;
	1					9
	1					21944
16
17							our $VERSION = '3.43';
18
19							############################################ Constructor ##############################################
20							sub new {
21	1			1	1	559	my ($class, %args, $eval_or_boosting_mode);
22	1	50				6	if (@_ % 2 != 0) {
23	1					7	($class, %args) = @_;
24							} else {
25	0					0	$class = shift;
26	0					0	$eval_or_boosting_mode = shift;
27	0	0	0			0	die unless $eval_or_boosting_mode eq 'evalmode' \|\| $eval_or_boosting_mode eq 'boostingmode';
28	0	0				0	die "Only one string arg allowed in eval and boosting modes" if @_;
29							}
30	1	50				5	unless ($eval_or_boosting_mode) {
31	1					5	my @params = keys %args;
32	1	50				5	croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
33							if check_for_illegal_params2(@params) == 0;
34							}
35							bless {
36							_training_datafile => $args{training_datafile},
37							_entropy_threshold => $args{entropy_threshold} \|\| 0.01,
38							_max_depth_desired => exists $args{max_depth_desired} ?
39							$args{max_depth_desired} : undef,
40							_debug1 => $args{debug1} \|\| 0,
41							_debug2 => $args{debug2} \|\| 0,
42							_debug3 => $args{debug3} \|\| 0,
43							_csv_class_column_index => $args{csv_class_column_index} \|\| undef,
44							_csv_columns_for_features => $args{csv_columns_for_features} \|\| undef,
45							_symbolic_to_numeric_cardinality_threshold
46							=> $args{symbolic_to_numeric_cardinality_threshold} \|\| 10,
47							_number_of_histogram_bins => $args{number_of_histogram_bins} \|\| undef,
48	1	50	50			76	_csv_cleanup_needed => $args{csv_cleanup_needed} \|\| 0,
			50
			50
			50
			50
			50
			50
			50
			50
49							_training_data => [],
50							_root_node => undef,
51							_probability_cache => {},
52							_entropy_cache => {},
53							_training_data_hash => {},
54							_features_and_values_hash => {},
55							_samples_class_label_hash => {},
56							_class_names => [],
57							_class_priors => [],
58							_class_priors_hash => {},
59							_feature_names => [],
60							_numeric_features_valuerange_hash => {},
61							_sampling_points_for_numeric_feature_hash => {},
62							_feature_values_how_many_uniques_hash => {},
63							_prob_distribution_numeric_features_hash => {},
64							_histogram_delta_hash => {},
65							_num_of_histogram_bins_hash => {},
66							}, $class;
67
68							}
69
70							#################################### Classify with Decision Tree #######################################
71
72							## Classifies one test sample at a time using the decision tree constructed from
73							## your training file. The data record for the test sample must be supplied as
74							## shown in the scripts in the `examples' subdirectory. See the scripts
75							## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
76							sub classify {
77	0			0	1	0	my $self = shift;
78	0					0	my $root_node = shift;
79	0					0	my $feature_and_values = shift;
80	0					0	my $numregex = '[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?';
81	0					0	my @features_and_values = @$feature_and_values;
82	0					0	@features_and_values = @{deep_copy_array(\@features_and_values)};
	0					0
83	0	0				0	die "\n\nError in the names you have used for features and/or values. " .
84							"Try using the csv_cleanup_needed option in the constructor call."
85							unless $self->check_names_used(\@features_and_values);
86	0					0	my @new_features_and_values = ();
87	0					0	my $pattern = '(\S+)\s=\s(\S+)';
88	0					0	foreach my $feature_and_value (@features_and_values) {
89	0					0	$feature_and_value =~ /$pattern/;
90	0					0	my ($feature, $value) = ($1, $2);
91	0					0	my $newvalue = $value;
92	0					0	my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
	0					0
93	0					0	my $not_all_values_float = 0;
94	0	0				0	map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
	0					0
95	0	0	0			0	if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
	0					0
96							$not_all_values_float == 0) {
97	0					0	$newvalue = closest_sampling_point($value, \@unique_values_for_feature);
98							}
99	0					0	push @new_features_and_values, "$feature" . '=' . "$newvalue";
100							}
101	0					0	@features_and_values = @new_features_and_values;
102	0	0				0	print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
103	0					0	my %answer = ();
104	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
105	0					0	$answer{$class_name} = undef;
106							}
107	0					0	$answer{'solution_path'} = [];
108	0					0	my %classification = %{$self->recursive_descent_for_classification($root_node,
	0					0
109							\@features_and_values,\%answer)};
110	0					0	@{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
	0					0
	0					0
111	0	0				0	if ($self->{_debug3}) {
112	0					0	print "\nCL2 The classification:\n";
113	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
114	0					0	print " $class_name with probability $classification{$class_name}\n";
115							}
116							}
117	0					0	my %classification_for_display = ();
118	0					0	foreach my $item (keys %classification) {
119	0	0				0	if ($item ne 'solution_path') {
120	0					0	$classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
121							} else {
122	0					0	my @outlist = ();
123	0					0	foreach my $x (@{$classification{$item}}) {
	0					0
124	0					0	push @outlist, "NODE$x";
125							}
126	0					0	$classification_for_display{$item} = \@outlist;
127							}
128							}
129	0					0	return \%classification_for_display;
130							}
131
132							sub recursive_descent_for_classification {
133	0			0	0	0	my $self = shift;
134	0					0	my $node = shift;
135	0					0	my $features_and_values = shift;
136	0					0	my $answer = shift;
137	0					0	my @features_and_values = @$features_and_values;
138	0					0	my %answer = %$answer;
139	0					0	my @children = @{$node->get_children()};
	0					0
140	0	0				0	if (@children == 0) {
141	0					0	my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
	0					0
142	0					0	foreach my $i (0..@{$self->{_class_names}}-1) {
	0					0
143	0					0	$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
144							}
145	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
146	0					0	return \%answer;
147							}
148	0					0	my $feature_tested_at_node = $node->get_feature();
149							print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
150	0	0				0	if $self->{_debug3};
151	0					0	my $value_for_feature;
152							my $path_found;
153	0					0	my $pattern = '(\S+)\s=\s(\S+)';
154	0					0	foreach my $feature_and_value (@features_and_values) {
155	0					0	$feature_and_value =~ /$pattern/;
156	0	0				0	$value_for_feature = $2 if $feature_tested_at_node eq $1;
157							}
158							# The following clause introduced in Version 3.20
159	0	0				0	if (!defined $value_for_feature) {
160	0					0	my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
	0					0
161	0					0	foreach my $i (0..@{$self->{_class_names}}-1) {
	0					0
162	0					0	$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
163							}
164	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
165	0					0	return \%answer;
166							}
167	0	0				0	if ($value_for_feature) {
168	0	0				0	if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
	0					0
169	0	0				0	print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
170	0					0	my $pattern1 = '(.+)<(.+)';
171	0					0	my $pattern2 = '(.+)>(.+)';
172	0					0	foreach my $child (@children) {
173	0					0	my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
	0					0
174	0					0	my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
175	0	0				0	if ($last_feature_and_value_on_branch =~ /$pattern1/) {
176	0					0	my ($feature, $threshold) = ($1,$2);
177	0	0				0	if ($value_for_feature <= $threshold) {
178	0					0	$path_found = 1;
179	0					0	%answer = %{$self->recursive_descent_for_classification($child,
	0					0
180							$features_and_values,\%answer)};
181	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
182	0					0	last;
183							}
184							}
185	0	0				0	if ($last_feature_and_value_on_branch =~ /$pattern2/) {
186	0					0	my ($feature, $threshold) = ($1,$2);
187	0	0				0	if ($value_for_feature > $threshold) {
188	0					0	$path_found = 1;
189	0					0	%answer = %{$self->recursive_descent_for_classification($child,
	0					0
190							$features_and_values,\%answer)};
191	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
192	0					0	last;
193							}
194							}
195							}
196	0	0				0	return \%answer if $path_found;
197							} else {
198	0					0	my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
199							print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
200	0	0				0	if $self->{_debug3};
201	0					0	foreach my $child (@children) {
202	0					0	my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
	0					0
203	0	0				0	print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
204	0					0	my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
205	0	0				0	if ($last_feature_and_value_on_branch eq $feature_value_combo) {
206	0					0	%answer = %{$self->recursive_descent_for_classification($child,
	0					0
207							$features_and_values,\%answer)};
208	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
209	0					0	$path_found = 1;
210	0					0	last;
211							}
212							}
213	0	0				0	return \%answer if $path_found;
214							}
215							}
216	0	0				0	if (! $path_found) {
217	0					0	my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
	0					0
218	0					0	foreach my $i (0..@{$self->{_class_names}}-1) {
	0					0
219	0					0	$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
220							}
221	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
222							}
223	0					0	return \%answer;
224							}
225
226							## If you want classification to be carried out by engaging a human user in a
227							## question-answer session, this is the method to use for that purpose. See, for
228							## example, the script classify_by_asking_questions.pl in the `examples'
229							## subdirectory for an illustration of how to do that.
230							sub classify_by_asking_questions {
231	0			0	1	0	my $self = shift;
232	0					0	my $root_node = shift;
233	0					0	my %answer = ();
234	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
235	0					0	$answer{$class_name} = undef;
236							}
237	0					0	$answer{'solution_path'} = [];
238	0					0	my %scratchpad_for_numeric_answers = ();
239	0					0	foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
	0					0
240	0					0	$scratchpad_for_numeric_answers{$feature_name} = undef;
241							}
242	0					0	my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
	0					0
243							\%answer, \%scratchpad_for_numeric_answers)};
244	0					0	@{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
	0					0
	0					0
245	0					0	my %classification_for_display = ();
246	0					0	foreach my $item (keys %classification) {
247	0	0				0	if ($item ne 'solution_path') {
248	0					0	$classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
249							} else {
250	0					0	my @outlist = ();
251	0					0	foreach my $x (@{$classification{$item}}) {
	0					0
252	0					0	push @outlist, "NODE$x";
253							}
254	0					0	$classification_for_display{$item} = \@outlist;
255							}
256							}
257	0					0	return \%classification_for_display;
258							}
259
260							sub interactive_recursive_descent_for_classification {
261	0			0	0	0	my $self = shift;
262	0					0	my $node = shift;
263	0					0	my $answer = shift;
264	0					0	my $scratchpad_for_numerics = shift;
265	0					0	my %answer = %$answer;
266	0					0	my %scratchpad_for_numerics = %$scratchpad_for_numerics;
267	0					0	my $pattern1 = '(.+)<(.+)';
268	0					0	my $pattern2 = '(.+)>(.+)';
269	0					0	my $user_value_for_feature;
270	0					0	my @children = @{$node->get_children()};
	0					0
271	0	0				0	if (@children == 0) {
272	0					0	my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
	0					0
273	0					0	foreach my $i (0..@{$self->{_class_names}}-1) {
	0					0
274	0					0	$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
275							}
276	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
277	0					0	return \%answer;
278							}
279	0					0	my @list_of_branch_attributes_to_children = ();
280	0					0	foreach my $child (@children) {
281	0					0	my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
	0					0
282	0					0	my $feature_and_value_on_branch = $branch_features_and_values[-1];
283	0					0	push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
284							}
285	0					0	my $feature_tested_at_node = $node->get_feature();
286	0					0	my $feature_value_combo;
287	0					0	my $path_found = 0;
288	0	0				0	if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
	0					0
289	0	0				0	if ($scratchpad_for_numerics{$feature_tested_at_node}) {
290	0					0	$user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
291							} else {
292	0					0	my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
	0					0
293	0					0	while (1) {
294	0					0	print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
295	0					0	print "\nEnter a value in the range (@valuerange): ";
296	0					0	$user_value_for_feature = ;
297	0					0	$user_value_for_feature =~ s/\r?\n?$//;
298	0					0	$user_value_for_feature =~ s/^\s(\S+)\s$/$1/;
299	0					0	my $answer_found = 0;
300	0	0	0			0	if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
301	0					0	$answer_found = 1;
302	0					0	last;
303							}
304	0	0				0	last if $answer_found;
305	0					0	print("You entered illegal value. Let's try again")
306							}
307	0					0	$scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
308							}
309	0					0	foreach my $i (0..@list_of_branch_attributes_to_children-1) {
310	0					0	my $branch_attribute = $list_of_branch_attributes_to_children[$i];
311	0	0				0	if ($branch_attribute =~ /$pattern1/) {
312	0					0	my ($feature,$threshold) = ($1,$2);
313	0	0				0	if ($user_value_for_feature <= $threshold) {
314	0					0	%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
	0					0
315							\%answer, \%scratchpad_for_numerics)};
316	0					0	$path_found = 1;
317	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
318	0					0	last;
319							}
320							}
321	0	0				0	if ($branch_attribute =~ /$pattern2/) {
322	0					0	my ($feature,$threshold) = ($1,$2);
323	0	0				0	if ($user_value_for_feature > $threshold) {
324	0					0	%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
	0					0
325							\%answer, \%scratchpad_for_numerics)};
326	0					0	$path_found = 1;
327	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
328	0					0	last;
329							}
330							}
331							}
332	0	0				0	return \%answer if $path_found;
333							} else {
334	0					0	my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
	0					0
335	0					0	while (1) {
336	0					0	print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
337	0					0	print "\nEnter a value from the list (@possible_values_for_feature): ";
338	0					0	$user_value_for_feature = ;
339	0					0	$user_value_for_feature =~ s/\r?\n?$//;
340	0					0	$user_value_for_feature =~ s/^\s(\S+)\s$/$1/;
341	0					0	my $answer_found = 0;
342	0	0				0	if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
343	0					0	$answer_found = 1;
344	0					0	last;
345							}
346	0	0				0	last if $answer_found;
347	0					0	print("You entered illegal value. Let's try again");
348							}
349	0					0	$feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
350	0					0	foreach my $i (0..@list_of_branch_attributes_to_children-1) {
351	0					0	my $branch_attribute = $list_of_branch_attributes_to_children[$i];
352	0	0				0	if ($branch_attribute eq $feature_value_combo) {
353	0					0	%answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
	0					0
354							\%answer, \%scratchpad_for_numerics)};
355	0					0	$path_found = 1;
356	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
357	0					0	last;
358							}
359							}
360	0	0				0	return \%answer if $path_found;
361							}
362	0	0				0	if (! $path_found) {
363	0					0	my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
	0					0
364	0					0	foreach my $i (0..@{$self->{_class_names}}-1) {
	0					0
365	0					0	$answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
366							}
367	0					0	push @{$answer{'solution_path'}}, $node->get_serial_num();
	0					0
368							}
369	0					0	return \%answer;
370							}
371
372							###################################### Decision Tree Construction ####################################
373
374							## At the root node, we find the best feature that yields the greatest reduction in
375							## class entropy from the entropy based on just the class priors. The logic for
376							## finding this feature is different for symbolic features and for numeric features.
377							## That logic is built into the method shown later for best feature calculations.
378							sub construct_decision_tree_classifier {
379	1			1	1	18	print "\nConstructing the decision tree ...\n";
380	1					6	my $self = shift;
381	1	50				7	if ($self->{_debug3}) {
382	0					0	$self->determine_data_condition();
383	0					0	print "\nStarting construction of the decision tree:\n";
384							}
385	1					4	my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
	2					12
	1					6
386	1	50				9	if ($self->{_debug3}) {
387	0					0	print "\nPrior class probabilities: @class_probabilities\n";
388	0					0	print "\nClass names: @{$self->{_class_names}}\n";
	0					0
389							}
390	1					8	my $entropy = $self->class_entropy_on_priors();
391	1	50				9	print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
392	1					35	my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
393	1					4	$root_node->set_class_names(\@{$self->{_class_names}});
	1					8
394	1					4	$self->{_root_node} = $root_node;
395	1					7	$self->recursive_descent($root_node);
396	1					11	return $root_node;
397							}
398
399							## After the root node of the decision tree is calculated by the previous methods,
400							## we invoke this method recursively to create the rest of the tree. At each node,
401							## we find the feature that achieves the largest entropy reduction with regard to
402							## the partitioning of the training data samples that correspond to that node.
403							sub recursive_descent {
404	46			46	0	119	my $self = shift;
405	46					109	my $node = shift;
406							print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
407	46	50				182	if $self->{_debug3};
408	46					171	my $node_serial_number = $node->get_serial_num();
409	46					106	my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
	46					175
410	46					165	my $existing_node_entropy = $node->get_node_entropy();
411	46	50				315	if ($self->{_debug3}) {
412	0					0	print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
413	0					0	print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
414	0					0	print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
415	0					0	my @class_probs = @{$node->get_class_probabilities()};
	0					0
416	0					0	print "\nRD4 Class probabilities: @class_probs\n";
417							}
418	46	50				249	if ($existing_node_entropy < $self->{_entropy_threshold}) {
419	0	0				0	print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
420	0					0	return;
421							}
422	46					108	my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	46					157
423	46					245	my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
424							$self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
425	46					329	$node->set_feature($best_feature);
426	46	50				205	$node->display_node() if $self->{_debug3};
427	46	50	33			188	if (defined($self->{_max_depth_desired}) &&
428							(@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
429	0	0				0	print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
430	0					0	return;
431							}
432	46	50				175	return if ! defined $best_feature;
433	46	50				161	if ($self->{_debug3}) {
434	0					0	print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
435	0					0	print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
436	0					0	print "\nRD9 Best feature entropy: $best_feature_entropy\n";
437	0					0	print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
438							}
439	46					140	my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
440	46	50				174	print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
441	46	100				189	if ($entropy_gain > $self->{_entropy_threshold}) {
442	26	50	33			154	if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
443							$self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
444							$self->{_symbolic_to_numeric_cardinality_threshold}) {
445	0					0	my $best_threshold = $decision_val; # as returned by best feature calculator
446	0					0	my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
447							my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
448	0					0	@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	0					0
449							my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
450	0					0	@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	0					0
451	0					0	my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
452	0					0	my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
453	0					0	push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
454							$feature_threshold_combo_for_less_than;
455	0					0	push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
456							$feature_threshold_combo_for_greater_than;
457	0	0				0	if ($self->{_debug3}) {
458	0					0	print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
459							"@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
460	0					0	print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
461							"@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
462							}
463							my @class_probabilities_for_lessthan_child_node =
464	0					0	map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
465	0					0	\@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
	0					0
466							my @class_probabilities_for_greaterthan_child_node =
467	0					0	map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
468	0					0	\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
	0					0
469	0	0				0	if ($self->{_debug3}) {
470	0					0	print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
471	0					0	print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
472							}
473	0	0				0	if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
474	0					0	my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
475							\@class_probabilities_for_lessthan_child_node,
476							\@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
477	0					0	$node->add_child_link($left_child_node);
478	0					0	$self->recursive_descent($left_child_node);
479							}
480	0	0				0	if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
481	0					0	my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
482							\@class_probabilities_for_greaterthan_child_node,
483							\@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
484	0					0	$node->add_child_link($right_child_node);
485	0					0	$self->recursive_descent($right_child_node);
486							}
487							} else {
488							print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
489	26	50				108	if $self->{_debug3};
490	26					68	my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
	26					189
491	26	50				116	print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
492	26					95	my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
	88					495
493	26					88	my @class_entropies_for_children = ();
494	26					126	foreach my $feature_and_value_index (0..@feature_value_combos-1) {
495							print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
496	88	50				328	if $self->{_debug3};
497	88					186	my @extended_branch_features_and_values_or_thresholds;
498	88	100				296	if (! @features_and_values_or_thresholds_on_branch) {
499	4					18	@extended_branch_features_and_values_or_thresholds =
500							($feature_value_combos[$feature_and_value_index]);
501							} else {
502							@extended_branch_features_and_values_or_thresholds =
503	84					177	@{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	84					258
504	84					279	push @extended_branch_features_and_values_or_thresholds,
505							$feature_value_combos[$feature_and_value_index];
506							}
507							my @class_probabilities =
508	176					568	map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
509	88					191	\@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
	88					290
510	88					310	my $class_entropy_for_child =
511							$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
512							\@extended_branch_features_and_values_or_thresholds);
513	88	50				297	if ($self->{_debug3}) {
514	0					0	print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
515	0					0	print "\nRD20 class entropy for child: $class_entropy_for_child\n";
516							}
517	88	100				302	if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
518	45					290	my $child_node = DTNode->new(undef, $class_entropy_for_child,
519							\@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
520	45					227	$node->add_child_link($child_node);
521	45					289	$self->recursive_descent($child_node);
522							} else {
523	43	50				281	print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
524							}
525							}
526							}
527							} else {
528							print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
529	20	50				74	if $self->{_debug3};
530	20					159	return;
531							}
532							}
533
534							## This is the heart of the decision tree constructor. Its main job is to figure
535							## out the best feature to use for partitioning the training data samples that
536							## correspond to the current node. The search for the best feature is carried out
537							## differently for symbolic features and for numeric features. For a symbolic
538							## feature, the method estimates the entropy for each value of the feature and then
539							## averages out these entropies as a measure of the discriminatory power of that
540							## features. For a numeric feature, on the other hand, it estimates the entropy
541							## reduction that can be achieved if were to partition the set of training samples
542							## for each possible threshold. For a numeric feature, all possible sampling points
543							## relevant to the node in question are considered as candidates for thresholds.
544							sub best_feature_calculator {
545	46			46	0	108	my $self = shift;
546	46					105	my $features_and_values_or_thresholds_on_branch = shift;
547	46					120	my $existing_node_entropy = shift;
548	46					217	my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
549	46					109	my $pattern1 = '(.+)=(.+)';
550	46					106	my $pattern2 = '(.+)<(.+)';
551	46					145	my $pattern3 = '(.+)>(.+)';
552	46					125	my @all_symbolic_features = ();
553	46					112	foreach my $feature_name (@{$self->{_feature_names}}) {
	46					153
554							push @all_symbolic_features, $feature_name
555	184	50				611	if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
556							}
557	46					150	my @symbolic_features_already_used = ();
558	46					136	foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
559	138	50				1156	push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
560							}
561	46					150	my @symbolic_features_not_yet_used;
562	46					130	foreach my $x (@all_symbolic_features) {
563	184	100				539	push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
564							}
565	46					128	my @true_numeric_types = ();
566	46					103	my @symbolic_types = ();
567	46					160	my @true_numeric_types_feature_names = ();
568	46					116	my @symbolic_types_feature_names = ();
569	46					118	foreach my $item (@features_and_values_or_thresholds_on_branch) {
570	138	50				1047	if ($item =~ /$pattern2/) {
		50
		50
571	0					0	push @true_numeric_types, $item;
572	0					0	push @true_numeric_types_feature_names, $1;
573							} elsif ($item =~ /$pattern3/) {
574	0					0	push @true_numeric_types, $item;
575	0					0	push @true_numeric_types_feature_names, $1;
576							} elsif ($item =~ /$pattern1/) {
577	138					342	push @symbolic_types, $item;
578	138					439	push @symbolic_types_feature_names, $1;
579							} else {
580	0					0	die "format error in the representation of feature and values or thresholds";
581							}
582							}
583	46					182	my %seen = ();
584	46	0				130	@true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
	0					0
585	46					123	%seen = ();
586	46	50				141	@symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
	138					765
587							my @bounded_intervals_numeric_types =
588	46					112	@{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
	46					205
589							# Calculate the upper and the lower bounds to be used when searching for the best
590							# threshold for each of the numeric features that are in play at the current node:
591	46					141	my (%upperbound, %lowerbound);
592	46					144	foreach my $feature (@true_numeric_types_feature_names) {
593	0					0	$upperbound{$feature} = undef;
594	0					0	$lowerbound{$feature} = undef;
595							}
596	46					130	foreach my $item (@bounded_intervals_numeric_types) {
597	0					0	foreach my $feature_grouping (@$item) {
598	0	0				0	if ($feature_grouping->[1] eq '>') {
599	0					0	$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
600							} else {
601	0					0	$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
602							}
603							}
604							}
605	46					141	my %entropy_values_for_different_features = ();
606	46					114	my %partitioning_point_child_entropies_hash = ();
607	46					107	my %partitioning_point_threshold = ();
608	46					118	my %entropies_for_different_values_of_symbolic_feature = ();
609	46					106	foreach my $feature (@{$self->{_feature_names}}) {
	46					159
610	184					448	$entropy_values_for_different_features{$feature} = [];
611	184					453	$partitioning_point_child_entropies_hash{$feature} = {};
612	184					374	$partitioning_point_threshold{$feature} = undef;
613	184					519	$entropies_for_different_values_of_symbolic_feature{$feature} = [];
614							}
615	46					112	foreach my $i (0..@{$self->{_feature_names}}-1) {
	46					167
616	184					544	my $feature_name = $self->{_feature_names}->[$i];
617	184	50				545	print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
618	184	100	33			590	if (contained_in($feature_name, @symbolic_features_already_used)) {
		50
619	138					431	next;
620	46					231	} elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
621							$self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
622							$self->{_symbolic_to_numeric_cardinality_threshold}) {
623	0					0	my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
	0					0
624	0	0				0	print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
625	0					0	my @newvalues = ();
626	0	0				0	if (contained_in($feature_name, @true_numeric_types_feature_names)) {
627	0	0	0			0	if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
		0	0
		0	0
		0	0
628							$lowerbound{$feature_name} >= $upperbound{$feature_name}) {
629	0					0	next;
630							} elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
631							$lowerbound{$feature_name} < $upperbound{$feature_name}) {
632	0					0	foreach my $x (@values) {
633	0	0	0			0	push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
634							}
635							} elsif (defined($upperbound{$feature_name})) {
636	0					0	foreach my $x (@values) {
637	0	0				0	push @newvalues, $x if $x <= $upperbound{$feature_name};
638							}
639							} elsif (defined($lowerbound{$feature_name})) {
640	0					0	foreach my $x (@values) {
641	0	0				0	push @newvalues, $x if $x > $lowerbound{$feature_name};
642							}
643							} else {
644	0					0	die "Error is bound specifications in best feature calculator";
645							}
646							} else {
647	0					0	@newvalues = @{deep_copy_array(\@values)};
	0					0
648							}
649	0	0				0	next if @newvalues == 0;
650	0					0	my @partitioning_entropies = ();
651	0					0	foreach my $value (@newvalues) {
652	0					0	my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
653	0					0	my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
654	0					0	my @for_left_child;
655							my @for_right_child;
656	0	0				0	if (@features_and_values_or_thresholds_on_branch) {
657	0					0	@for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	0					0
658	0					0	push @for_left_child, $feature_and_less_than_value_string;
659	0					0	@for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	0					0
660	0					0	push @for_right_child, $feature_and_greater_than_value_string;
661							} else {
662	0					0	@for_left_child = ($feature_and_less_than_value_string);
663	0					0	@for_right_child = ($feature_and_greater_than_value_string);
664							}
665	0					0	my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
666							\@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667	0					0	my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
668							\@features_and_values_or_thresholds_on_branch, $feature_name, $value);
669	0					0	my $partitioning_entropy = $entropy1 *
670							$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
671							$entropy2 *
672							$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
673
674	0					0	push @partitioning_entropies, $partitioning_entropy;
675	0					0	$partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
676							}
677	0					0	my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
678	0	0				0	if ($min_entropy < $existing_node_entropy) {
679	0					0	$partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
680	0					0	$entropy_values_for_different_features{$feature_name} = $min_entropy;
681							}
682							} else {
683	46	50				190	print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
684	46	50				175	print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
685	46					140	my %seen;
686	159	50				1182	my @values = grep {$_ ne 'NA' && !$seen{$_}++}
687	46					117	@{$self->{_features_and_unique_values_hash}->{$feature_name}};
	46					223
688	46					282	@values = sort @values;
689	46	50				210	print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
690
691	46					134	my $entropy = 0;
692	46					154	foreach my $value (@values) {
693	159					674	my $feature_value_string = "$feature_name" . '=' . "$value";
694	159	50				536	print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
695	159					313	my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
	159					528
696	159	100				570	if (@features_and_values_or_thresholds_on_branch) {
697	145					385	push @extended_attributes, $feature_value_string;
698							} else {
699	14					62	@extended_attributes = ($feature_value_string);
700							}
701	159					608	$entropy +=
702							$self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
703							$self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
704							print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
705	159	50				595	"is $entropy\n" if $self->{_debug3};
706	159					377	push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
	159					799
707							}
708	46	50				225	if ($entropy < $existing_node_entropy) {
709	46					465	$entropy_values_for_different_features{$feature_name} = $entropy;
710							}
711							}
712							}
713	46					189	my $min_entropy_for_best_feature;
714							my $best_feature_name;
715	46					224	foreach my $feature_nom (keys %entropy_values_for_different_features) {
716	184	100				478	if (!defined($best_feature_name)) {
717	46					106	$best_feature_name = $feature_nom;
718	46					157	$min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
719							} else {
720	138	100				503	if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
721	43					131	$best_feature_name = $feature_nom;
722	43					118	$min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
723							}
724							}
725							}
726	46					120	my $threshold_for_best_feature;
727	46	50				155	if (exists $partitioning_point_threshold{$best_feature_name}) {
728	46					126	$threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
729							} else {
730	0					0	$threshold_for_best_feature = undef;
731							}
732	46					116	my $best_feature_entropy = $min_entropy_for_best_feature;
733	46					112	my @val_based_entropies_to_be_returned;
734							my $decision_val_to_be_returned;
735	46	50	33			236	if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
736							$self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
737							$self->{_symbolic_to_numeric_cardinality_threshold}) {
738							@val_based_entropies_to_be_returned =
739	0					0	@{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
	0					0
740							} else {
741	46					125	@val_based_entropies_to_be_returned = ();
742							}
743	46	50				152	if (exists $partitioning_point_threshold{$best_feature_name}) {
744	46					110	$decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
745							} else {
746	0					0	$decision_val_to_be_returned = undef;
747							}
748							print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
749	46	50				174	"@val_based_entropies_to_be_returned\n" if $self->{_debug3};
750	46					1506	return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
751							$decision_val_to_be_returned);
752							}
753
754							######################################### Entropy Calculators #####################################
755
756							sub class_entropy_on_priors {
757	1			1	0	5	my $self = shift;
758							return $self->{_entropy_cache}->{'priors'}
759	1	50				8	if exists $self->{_entropy_cache}->{"priors"};
760	1					4	my @class_names = @{$self->{_class_names}};
	1					7
761	1					4	my $entropy;
762	1					5	foreach my $class (@class_names) {
763	2					11	my $prob = $self->prior_probability_for_class($class);
764	2	50	33			28	my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
765	2	50				10	$log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
766	2	50				10	$log_prob = 0 if $prob > 0.999; # since log(1) = 0
767	2	100				10	if (!defined $entropy) {
768	1					4	$entropy = -1.0 * $prob * $log_prob;
769	1					5	next;
770							}
771	1					7	$entropy += -1.0 * $prob * $log_prob;
772							}
773	1					5	$self->{_entropy_cache}->{'priors'} = $entropy;
774	1					5	return $entropy;
775							}
776
777							sub entropy_scanner_for_a_numeric_feature {
778	0			0	0	0	local $\| = 1;
779	0					0	my $self = shift;
780	0					0	my $feature = shift;
781	0					0	my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
	0					0
782	0					0	my @entropies_for_less_than_thresholds = ();
783	0					0	my @entropies_for_greater_than_thresholds = ();
784	0					0	foreach my $point (@all_sampling_points) {
785	0					0	print ". ";
786	0					0	push @entropies_for_less_than_thresholds,
787							$self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
788	0					0	push @entropies_for_greater_than_thresholds,
789							$self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
790							}
791	0					0	print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
792							"@entropies_for_less_than_thresholds\n";
793	0					0	print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
794							"@entropies_for_greater_than_thresholds\n";
795							}
796
797							sub class_entropy_for_less_than_threshold_for_feature {
798	0			0	0	0	my $self = shift;
799	0					0	my $arr = shift;
800	0					0	my $feature = shift;
801	0					0	my $threshold = shift;
802	0					0	my @array_of_features_and_values_or_thresholds = @$arr;
803	0					0	my $feature_threshold_combo = "$feature" . '<' . "$threshold";
804	0					0	my $sequence = join ":", @array_of_features_and_values_or_thresholds;
805	0					0	$sequence .= ":" . $feature_threshold_combo;
806	0	0				0	return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
807							my @copy_of_array_of_features_and_values_or_thresholds =
808	0					0	@{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
	0					0
809	0					0	push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
810	0					0	my $entropy = 0;
811	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
812	0					0	my $log_prob = undef;
813	0					0	my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
814							$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
815	0	0	0			0	if ($prob >= 0.0001 && $prob <= 0.999) {
		0
		0
816	0					0	$log_prob = log($prob) / log(2.0);
817							} elsif ($prob < 0.0001) {
818	0					0	$log_prob = 0;
819							} elsif ($prob > 0.999) {
820	0					0	$log_prob = 0;
821							} else {
822	0					0	die "An error has occurred in log_prob calculation";
823							}
824	0					0	$entropy += -1.0 * $prob * $log_prob;
825							}
826	0	0				0	if (abs($entropy) < 0.0000001) {
827	0					0	$entropy = 0.0;
828							}
829	0					0	$self->{_entropy_cache}->{$sequence} = $entropy;
830	0					0	return $entropy;
831							}
832
833							sub class_entropy_for_greater_than_threshold_for_feature {
834	0			0	0	0	my $self = shift;
835	0					0	my $arr = shift;
836	0					0	my $feature = shift;
837	0					0	my $threshold = shift;
838	0					0	my @array_of_features_and_values_or_thresholds = @$arr;
839	0					0	my $feature_threshold_combo = "$feature" . '>' . "$threshold";
840	0					0	my $sequence = join ":", @array_of_features_and_values_or_thresholds;
841	0					0	$sequence .= ":" . $feature_threshold_combo;
842	0	0				0	return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
843							my @copy_of_array_of_features_and_values_or_thresholds =
844	0					0	@{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
	0					0
845	0					0	push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
846	0					0	my $entropy = 0;
847	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
848	0					0	my $log_prob = undef;
849	0					0	my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
850							$class_name, \@copy_of_array_of_features_and_values_or_thresholds);
851	0	0	0			0	if ($prob >= 0.0001 && $prob <= 0.999) {
		0
		0
852	0					0	$log_prob = log($prob) / log(2.0);
853							} elsif ($prob < 0.0001) {
854	0					0	$log_prob = 0;
855							} elsif ($prob > 0.999) {
856	0					0	$log_prob = 0;
857							} else {
858	0					0	die "An error has occurred in log_prob calculation";
859							}
860	0					0	$entropy += -1.0 * $prob * $log_prob;
861							}
862	0	0				0	if (abs($entropy) < 0.0000001) {
863	0					0	$entropy = 0.0;
864							}
865	0					0	$self->{_entropy_cache}->{$sequence} = $entropy;
866	0					0	return $entropy;
867							}
868
869							sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
870	247			247	0	566	my $self = shift;
871	247					456	my $array_of_features_and_values_or_thresholds = shift;
872	247					670	my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
873	247					725	my $sequence = join ":", @array_of_features_and_values_or_thresholds;
874	247	100				1033	return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
875	159					363	my $entropy = 0;
876	159					310	foreach my $class_name (@{$self->{_class_names}}) {
	159					469
877	318					698	my $log_prob = undef;
878	318					1015	my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
879							$class_name, \@array_of_features_and_values_or_thresholds);
880	318	50	33			1672	if ($prob >= 0.0001 && $prob <= 0.999) {
		0
		0
881	318					1056	$log_prob = log($prob) / log(2.0);
882							} elsif ($prob < 0.0001) {
883	0					0	$log_prob = 0;
884							} elsif ($prob > 0.999) {
885	0					0	$log_prob = 0;
886							} else {
887	0					0	die "An error has occurred in log_prob calculation";
888							}
889	318					918	$entropy += -1.0 * $prob * $log_prob;
890							}
891	159	50				593	if (abs($entropy) < 0.0000001) {
892	0					0	$entropy = 0.0;
893							}
894	159					578	$self->{_entropy_cache}->{$sequence} = $entropy;
895	159					791	return $entropy;
896							}
897
898
899							##################################### Probability Calculators ########################################
900
901							sub prior_probability_for_class {
902	4			4	0	10	my $self = shift;
903	4					11	my $class = shift;
904	4					14	my $class_name_in_cache = "prior" . '::' . $class;
905							return $self->{_probability_cache}->{$class_name_in_cache}
906	4	50				25	if exists $self->{_probability_cache}->{$class_name_in_cache};
907	0					0	my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
	0					0
908	0					0	my @values = values %{$self->{_samples_class_label_hash}};
	0					0
909	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
910	0					0	my @trues = grep {$_ eq $class_name} @values;
	0					0
911	0					0	my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
912	0					0	my $this_class_name_in_cache = "prior" . '::' . $class_name;
913	0					0	$self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
914							}
915	0					0	return $self->{_probability_cache}->{$class_name_in_cache};
916							}
917
918							sub calculate_class_priors {
919	1			1	1	19	my $self = shift;
920	1	50				4	return if scalar keys %{$self->{_class_priors_hash}} > 1;
	1					10
921	1					5	foreach my $class_name (@{$self->{_class_names}}) {
	1					6
922	2					11	my $class_name_in_cache = "prior::$class_name";
923	2					6	my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
	2					10
924	2					6	my @all_values = values %{$self->{_samples_class_label_hash}};
	2					28
925	2					8	my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
	70					196
926	2					11	my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
927	2					11	$self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
928	2					9	my $this_class_name_in_cache = "prior::$class_name";
929	2					17	$self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
930							}
931	1	50				9	if ($self->{_debug1}) {
932	0					0	foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
	0					0
933	0					0	print "$class => $self->{_class_priors_hash}->{$class}\n";
934							}
935							}
936							}
937
938							sub calculate_first_order_probabilities {
939	1			1	1	21	print "\nEstimating probabilities...\n";
940	1					3	my $self = shift;
941	1					2	foreach my $feature (@{$self->{_feature_names}}) {
	1					4
942	4					30	$self->probability_of_feature_value($feature, undef);
943	4	50				28	if ($self->{_debug2}) {
944	0	0				0	if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
945	0					0	print "\nPresenting probability distribution for a numeric feature:\n";
946	0					0	foreach my $sampling_point (sort {$a <=> $b} keys
	0					0
947	0					0	%{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
948	0					0	my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
949							print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
950	0					0	$self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
951							}
952							} else {
953	0					0	print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
954	0					0	my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
	0					0
955	0					0	foreach my $value (sort @values_for_feature) {
956	0					0	my $prob = $self->probability_of_feature_value($feature,$value);
957	0					0	print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
958							}
959							}
960							}
961							}
962							}
963
964							sub probability_of_feature_value {
965	441			441	0	938	my $self = shift;
966	441					919	my $feature_name = shift;
967	441					942	my $value = shift;
968	441	50	66			2687	$value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
969	441	50	66			2082	if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
970							$value = closest_sampling_point($value,
971	0					0	$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
972							}
973	441					849	my $feature_and_value;
974	441	100				1224	if (defined($value)) {
975	437					1160	$feature_and_value = "$feature_name=$value";
976							}
977	441	50	66			2348	if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
978	437					1732	return $self->{_probability_cache}->{$feature_and_value};
979							}
980	4					15	my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
981	4	50				19	if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
982	0	0				0	if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
983							$self->{_symbolic_to_numeric_cardinality_threshold}) {
984	0	0				0	if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
985	0					0	@valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
	0					0
986	0					0	$diffrange = $valuerange[1] - $valuerange[0];
987	0					0	my %seen = ();
988	0	0	0			0	my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
	0					0
989	0					0	@{$self->{_features_and_values_hash}->{$feature_name}};
	0					0
990	0					0	my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
	0					0
	0					0
991							$unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
992	0					0	my $median_diff = $diffs[int(@diffs/2) - 1];
993	0					0	$histogram_delta = $median_diff * 2;
994	0	0				0	if ($histogram_delta < $diffrange / 500.0) {
995	0	0				0	if (defined $self->{_number_of_histogram_bins}) {
996	0					0	$histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
997							} else {
998	0					0	$histogram_delta = $diffrange / 500.0;
999							}
1000							}
1001	0					0	$self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1002	0					0	$num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1003	0					0	$self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1004	0					0	my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
	0					0
1005							0..$num_of_histogram_bins-1;
1006	0					0	$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1007							\@sampling_points_for_feature;
1008							}
1009							}
1010							}
1011	4	50				15	if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1012	0	0				0	if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1013							$self->{_symbolic_to_numeric_cardinality_threshold}) {
1014							my @sampling_points_for_feature =
1015	0					0	@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
	0					0
1016	0					0	my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1017	0					0	my @actual_values_for_feature = grep {$_ ne 'NA'}
1018	0					0	@{$self->{_features_and_values_hash}->{$feature_name}};
	0					0
1019	0					0	foreach my $i (0..@sampling_points_for_feature-1) {
1020	0					0	foreach my $j (0..@actual_values_for_feature-1) {
1021	0	0				0	if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1022	0					0	$counts_at_sampling_points[$i]++
1023							}
1024							}
1025							}
1026	0					0	my $total_counts = 0;
1027	0					0	map {$total_counts += $_} @counts_at_sampling_points;
	0					0
1028	0					0	my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
	0					0
1029	0					0	my %bin_prob_hash = ();
1030	0					0	foreach my $i (0..@sampling_points_for_feature-1) {
1031	0					0	$bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1032							}
1033	0					0	$self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1034	0					0	my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
	0					0
1035							@sampling_points_for_feature;
1036	0					0	foreach my $i (0..@values_for_feature-1) {
1037	0					0	$self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1038							}
1039	0	0	0			0	if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1040	0					0	return $self->{_probability_cache}->{$feature_and_value};
1041							} else {
1042	0					0	return 0;
1043							}
1044							} else {
1045	0					0	my %seen = ();
1046	0	0	0			0	my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1047	0					0	@{$self->{_features_and_values_hash}->{$feature_name}};
	0					0
1048	0					0	@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
	0					0
1049	0					0	my @value_counts = (0) x @values_for_feature;
1050							# foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1051	0					0	foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
	0					0
	0					0
1052	0					0	my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
	0					0
1053	0					0	foreach my $i (0..@values_for_feature-1) {
1054	0					0	foreach my $current_value (@features_and_values) {
1055	0	0				0	$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1056							}
1057							}
1058							}
1059	0					0	my $total_counts = 0;
1060	0					0	map {$total_counts += $_} @value_counts;
	0					0
1061	0	0				0	die "PFV Something is wrong with your training file. It contains no training samples \
1062							for feature named $feature_name" if $total_counts == 0;
1063	0					0	my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
	0					0
1064	0					0	foreach my $i (0..@values_for_feature-1) {
1065	0					0	$self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1066							}
1067	0	0	0			0	if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1068	0					0	return $self->{_probability_cache}->{$feature_and_value};
1069							} else {
1070	0					0	return 0;
1071							}
1072							}
1073							} else {
1074							# This section is only for purely symbolic features:
1075	4					12	my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
	4					33
1076	4					15	@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
	140					272
1077	4					21	my @value_counts = (0) x @values_for_feature;
1078							# foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1079	4					10	foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
	544					1144
	4					53
1080	140					281	my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
	140					657
1081	140					401	foreach my $i (0..@values_for_feature-1) {
1082	4900					9059	for my $current_value (@features_and_values) {
1083	19600	100				47918	$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1084							}
1085							}
1086							}
1087	4					56	foreach my $i (0..@values_for_feature-1) {
1088							$self->{_probability_cache}->{$values_for_feature[$i]} =
1089	140					221	$value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
	140					382
1090							}
1091	4	50	33			32	if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1092	0					0	return $self->{_probability_cache}->{$feature_and_value};
1093							} else {
1094	4					40	return 0;
1095							}
1096							}
1097							}
1098
1099							sub probability_of_feature_value_given_class {
1100	882			882	0	1879	my $self = shift;
1101	882					1754	my $feature_name = shift;
1102	882					1672	my $feature_value = shift;
1103	882					1720	my $class_name = shift;
1104	882	50	33			4736	$feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1105	882	50	33			4195	if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1106							$feature_value = closest_sampling_point($feature_value,
1107	0					0	$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1108							}
1109	882					1780	my $feature_value_class;
1110	882	50				2234	if (defined($feature_value)) {
1111	882					2648	$feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1112							}
1113	882	100	66			4649	if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1114							print "\nNext answer returned by cache for feature $feature_name and " .
1115	874	50				2536	"value $feature_value given class $class_name\n" if $self->{_debug2};
1116	874					3495	return $self->{_probability_cache}->{$feature_value_class};
1117							}
1118	8					42	my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1119
1120	8	50				45	if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1121	0	0				0	if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1122							$self->{_symbolic_to_numeric_cardinality_threshold}) {
1123	0					0	$histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1124	0					0	$num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1125	0					0	@valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
	0					0
1126	0					0	$diffrange = $valuerange[1] - $valuerange[0];
1127							}
1128							}
1129	8					24	my @samples_for_class = ();
1130							# Accumulate all samples names for the given class:
1131	8					22	foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
	8					98
1132	280	100				907	if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1133	140					384	push @samples_for_class, $sample_name;
1134							}
1135							}
1136	8	50				54	if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1137	0	0				0	if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1138							$self->{_symbolic_to_numeric_cardinality_threshold}) {
1139							my @sampling_points_for_feature =
1140	0					0	@{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
	0					0
1141	0					0	my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1142	0					0	my @actual_feature_values_for_samples_in_class = ();
1143	0					0	foreach my $sample (@samples_for_class) {
1144	0					0	foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
	0					0
1145	0					0	my $pattern = '(.+)=(.+)';
1146	0					0	$feature_and_value =~ /$pattern/;
1147	0					0	my ($feature, $value) = ($1, $2);
1148	0	0	0			0	if (($feature eq $feature_name) && ($value ne 'NA')) {
1149	0					0	push @actual_feature_values_for_samples_in_class, $value;
1150							}
1151							}
1152							}
1153	0					0	foreach my $i (0..@sampling_points_for_feature-1) {
1154	0					0	foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1155	0	0				0	if (abs($sampling_points_for_feature[$i] -
1156							$actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1157	0					0	$counts_at_sampling_points[$i]++;
1158							}
1159							}
1160							}
1161	0					0	my $total_counts = 0;
1162	0					0	map {$total_counts += $_} @counts_at_sampling_points;
	0					0
1163	0	0				0	die "PFVC1 Something is wrong with your training file. It contains no training " .
1164							"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1165	0					0	my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
	0					0
1166	0					0	my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
	0					0
1167							@sampling_points_for_feature;
1168	0					0	foreach my $i (0..@values_for_feature_and_class-1) {
1169	0					0	$self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1170							}
1171	0	0				0	if (exists $self->{_probability_cache}->{$feature_value_class}) {
1172	0					0	return $self->{_probability_cache}->{$feature_value_class};
1173							} else {
1174	0					0	return 0;
1175							}
1176							} else {
1177							# This section is for numeric features that will be treated symbolically
1178	0					0	my %seen = ();
1179	0	0	0			0	my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1180	0					0	@{$self->{_features_and_values_hash}->{$feature_name}};
	0					0
1181	0					0	@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
	0					0
1182	0					0	my @value_counts = (0) x @values_for_feature;
1183	0					0	foreach my $sample (@samples_for_class) {
1184	0					0	my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
	0					0
1185	0					0	foreach my $i (0..@values_for_feature-1) {
1186	0					0	foreach my $current_value (@features_and_values) {
1187	0	0				0	$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1188							}
1189							}
1190							}
1191	0					0	my $total_counts = 0;
1192	0					0	map {$total_counts += $_} @value_counts;
	0					0
1193	0	0				0	die "PFVC2 Something is wrong with your training file. It contains no training " .
1194							"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1195							# We normalize by total_count because the probabilities are conditioned on a given class
1196	0					0	foreach my $i (0..@values_for_feature-1) {
1197	0					0	my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1198	0					0	$self->{_probability_cache}->{$feature_and_value_and_class} =
1199							$value_counts[$i] / (1.0 * $total_counts);
1200							}
1201	0	0				0	if (exists $self->{_probability_cache}->{$feature_value_class}) {
1202	0					0	return $self->{_probability_cache}->{$feature_value_class};
1203							} else {
1204	0					0	return 0;
1205							}
1206							}
1207							} else {
1208							# This section is for purely symbolic features
1209	8					25	my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
	8					107
1210	8					35	my %seen = ();
1211	280	100	66			1926	@values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1212	8					20	@{$self->{_features_and_values_hash}->{$feature_name}};
	8					36
1213	8					33	@values_for_feature = map {"$feature_name=$_"} @values_for_feature;
	28					118
1214	8					41	my @value_counts = (0) x @values_for_feature;
1215	8					31	foreach my $sample (@samples_for_class) {
1216	140					331	my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
	140					631
1217	140					440	foreach my $i (0..@values_for_feature-1) {
1218	490					1165	foreach my $current_value (@features_and_values) {
1219	1960	100				6044	$value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1220							}
1221							}
1222							}
1223	8					22	my $total_counts = 0;
1224	8					28	map {$total_counts += $_} @value_counts;
	28					79
1225	8	50				39	die "PFVC2 Something is wrong with your training file. It contains no training " .
1226							"samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1227							# We normalize by total_count because the probabilities are conditioned on a given class
1228	8					41	foreach my $i (0..@values_for_feature-1) {
1229	28					110	my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1230	28					189	$self->{_probability_cache}->{$feature_and_value_and_class} =
1231							$value_counts[$i] / (1.0 * $total_counts);
1232							}
1233	8	50				43	if (exists $self->{_probability_cache}->{$feature_value_class}) {
1234	8					89	return $self->{_probability_cache}->{$feature_value_class};
1235							} else {
1236	0					0	return 0;
1237							}
1238							}
1239							}
1240
1241							sub probability_of_feature_less_than_threshold {
1242	0			0	0	0	my $self = shift;
1243	0					0	my $feature_name = shift;
1244	0					0	my $threshold = shift;
1245	0					0	my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1246							return $self->{_probability_cache}->{$feature_threshold_combo}
1247	0	0				0	if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1248	0	0				0	my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
	0					0
	0					0
1249	0	0				0	my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
	0					0
1250	0					0	my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1251	0					0	$self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1252	0					0	return $probability;
1253							}
1254
1255							sub probability_of_feature_less_than_threshold_given_class {
1256	0			0	0	0	my $self = shift;
1257	0					0	my $feature_name = shift;
1258	0					0	my $threshold = shift;
1259	0					0	my $class_name = shift;
1260	0					0	my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1261							return $self->{_probability_cache}->{$feature_threshold_class_combo}
1262	0	0				0	if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1263	0					0	my @data_samples_for_class = ();
1264							# Accumulate all samples names for the given class:
1265	0					0	foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
	0					0
1266							push @data_samples_for_class, $sample_name
1267	0	0				0	if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1268							}
1269	0					0	my @actual_feature_values_for_samples_in_class = ();
1270	0					0	foreach my $sample (@data_samples_for_class) {
1271	0					0	foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
	0					0
1272	0					0	my $pattern = '(.+)=(.+)';
1273	0					0	$feature_and_value =~ /$pattern/;
1274	0					0	my ($feature,$value) = ($1,$2);
1275	0	0	0			0	push @actual_feature_values_for_samples_in_class, $value
1276							if $feature eq $feature_name && $value ne 'NA';
1277							}
1278							}
1279	0	0				0	my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
	0					0
1280							# The condition in the assignment that follows was a bug correction in Version 3.20
1281	0	0				0	my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1282	0					0	$self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1283	0					0	return $probability;
1284							}
1285
1286							# This method requires that all truly numeric types only be expressed as '<' or '>'
1287							# constructs in the array of branch features and thresholds
1288							sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1289	477			477	0	1038	my $self = shift;
1290	477					880	my $arr = shift;
1291	477					1471	my @array_of_features_and_values_or_thresholds = @$arr;
1292	477	50				1506	return if scalar @array_of_features_and_values_or_thresholds == 0;
1293	477					1469	my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1294	477	100				2124	return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1295	145					319	my $probability = undef;
1296	145					316	my $pattern1 = '(.+)=(.+)';
1297	145					306	my $pattern2 = '(.+)<(.+)';
1298	145					293	my $pattern3 = '(.+)>(.+)';
1299	145					320	my @true_numeric_types = ();
1300	145					312	my @true_numeric_types_feature_names = ();
1301	145					317	my @symbolic_types = ();
1302	145					317	my @symbolic_types_feature_names = ();
1303	145					371	foreach my $item (@array_of_features_and_values_or_thresholds) {
1304	437	50				1933	if ($item =~ /$pattern2/) {
		50
1305	0					0	push @true_numeric_types, $item;
1306	0					0	my ($feature,$value) = ($1,$2);
1307	0					0	push @true_numeric_types_feature_names, $feature;
1308							} elsif ($item =~ /$pattern3/) {
1309	0					0	push @true_numeric_types, $item;
1310	0					0	my ($feature,$value) = ($1,$2);
1311	0					0	push @true_numeric_types_feature_names, $feature;
1312							} else {
1313	437					1003	push @symbolic_types, $item;
1314	437					1668	$item =~ /$pattern1/;
1315	437					1581	my ($feature,$value) = ($1,$2);
1316	437					1209	push @symbolic_types_feature_names, $feature;
1317							}
1318							}
1319	145					390	my %seen1 = ();
1320	145	0				360	@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
	0					0
1321	145					325	my %seen2 = ();
1322	145	50				378	@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
	437					2337
1323	145					524	my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1324							print_array_with_msg("POS: Answer returned by find_bounded: ",
1325	145	50				494	$bounded_intervals_numeric_types) if $self->{_debug2};
1326							# Calculate the upper and the lower bounds to be used when searching for the best
1327							# threshold for each of the numeric features that are in play at the current node:
1328	145					360	my (%upperbound, %lowerbound);
1329	145					318	foreach my $feature_name (@true_numeric_types_feature_names) {
1330	0					0	$upperbound{$feature_name} = undef;
1331	0					0	$lowerbound{$feature_name} = undef;
1332							}
1333	145					439	foreach my $item (@$bounded_intervals_numeric_types) {
1334	0					0	foreach my $feature_grouping (@$item) {
1335	0	0				0	if ($feature_grouping->[1] eq '>') {
1336	0					0	$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337							} else {
1338	0					0	$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1339							}
1340							}
1341							}
1342	145					374	foreach my $feature_name (@true_numeric_types_feature_names) {
1343	0	0	0			0	if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
		0	0
		0	0
		0	0
			0
1344							$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1345	0					0	return 0;
1346							} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1347	0	0				0	if (! $probability) {
1348							$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1349							$upperbound{$feature_name}) -
1350	0					0	$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1351							} else {
1352							$probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1353							$upperbound{$feature_name}) -
1354	0					0	$self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1355							}
1356							} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1357	0	0				0	if (! $probability) {
1358							$probability = $self->probability_of_feature_less_than_threshold($feature_name,
1359	0					0	$upperbound{$feature_name});
1360							} else {
1361							$probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1362	0					0	$upperbound{$feature_name});
1363							}
1364							} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1365	0	0				0	if (! $probability) {
1366							$probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1367	0					0	$lowerbound{$feature_name});
1368							} else {
1369							$probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1370	0					0	$lowerbound{$feature_name}));
1371							}
1372							} else {
1373	0					0	die("Ill formatted call to 'probability_of_sequence' method");
1374							}
1375							}
1376	145					367	foreach my $feature_and_value (@symbolic_types) {
1377	437	50				2771	if ($feature_and_value =~ /$pattern1/) {
1378	437					1751	my ($feature,$value) = ($1,$2);
1379	437	100				1326	if (! $probability) {
1380	145					501	$probability = $self->probability_of_feature_value($feature, $value);
1381							} else {
1382	292					837	$probability *= $self->probability_of_feature_value($feature, $value);
1383							}
1384							}
1385							}
1386	145					606	$self->{_probability_cache}->{$sequence} = $probability;
1387	145					746	return $probability;
1388							}
1389
1390							## The following method requires that all truly numeric types only be expressed as
1391							## '<' or '>' constructs in the array of branch features and thresholds
1392							sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1393	318			318	0	665	my $self = shift;
1394	318					592	my $arr = shift;
1395	318					723	my $class_name = shift;
1396	318					901	my @array_of_features_and_values_or_thresholds = @$arr;
1397	318	50				984	return if scalar @array_of_features_and_values_or_thresholds == 0;
1398	318					958	my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1399	318					870	my $sequence_with_class = "$sequence" . "::" . $class_name;
1400							return $self->{_probability_cache}->{$sequence_with_class}
1401	318	100				1203	if exists $self->{_probability_cache}->{$sequence_with_class};
1402	298					618	my $probability = undef;
1403	298					606	my $pattern1 = '(.+)=(.+)';
1404	298					587	my $pattern2 = '(.+)<(.+)';
1405	298					550	my $pattern3 = '(.+)>(.+)';
1406	298					660	my @true_numeric_types = ();
1407	298					593	my @true_numeric_types_feature_names = ();
1408	298					652	my @symbolic_types = ();
1409	298					625	my @symbolic_types_feature_names = ();
1410	298					657	foreach my $item (@array_of_features_and_values_or_thresholds) {
1411	882	50				3740	if ($item =~ /$pattern2/) {
		50
1412	0					0	push @true_numeric_types, $item;
1413	0					0	my ($feature,$value) = ($1,$2);
1414	0					0	push @true_numeric_types_feature_names, $feature;
1415							} elsif ($item =~ /$pattern3/) {
1416	0					0	push @true_numeric_types, $item;
1417	0					0	my ($feature,$value) = ($1,$2);
1418	0					0	push @true_numeric_types_feature_names, $feature;
1419							} else {
1420	882					2021	push @symbolic_types, $item;
1421	882					3527	$item =~ /$pattern1/;
1422	882					3261	my ($feature,$value) = ($1,$2);
1423	882					2508	push @symbolic_types_feature_names, $feature;
1424							}
1425							}
1426	298					808	my %seen1 = ();
1427	298	0				717	@true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
	0					0
1428	298					621	my %seen2 = ();
1429	298	50				772	@symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
	882					4830
1430	298					1110	my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1431							print_array_with_msg("POSC: Answer returned by find_bounded: ",
1432	298	50				1013	$bounded_intervals_numeric_types) if $self->{_debug2};
1433							# Calculate the upper and the lower bounds to be used when searching for the best
1434							# threshold for each of the numeric features that are in play at the current node:
1435	298					629	my (%upperbound, %lowerbound);
1436	298					645	foreach my $feature_name (@true_numeric_types_feature_names) {
1437	0					0	$upperbound{$feature_name} = undef;
1438	0					0	$lowerbound{$feature_name} = undef;
1439							}
1440	298					676	foreach my $item (@$bounded_intervals_numeric_types) {
1441	0					0	foreach my $feature_grouping (@$item) {
1442	0	0				0	if ($feature_grouping->[1] eq '>') {
1443	0					0	$lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444							} else {
1445	0					0	$upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1446							}
1447							}
1448							}
1449	298					689	foreach my $feature_name (@true_numeric_types_feature_names) {
1450	0	0	0			0	if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
		0	0
		0	0
		0	0
			0
1451							$upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1452	0					0	return 0;
1453							} elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1454	0	0				0	if (! $probability) {
1455
1456							$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457							$upperbound{$feature_name}, $class_name) -
1458							$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1459	0					0	$lowerbound{$feature_name}, $class_name);
1460							} else {
1461							$probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462							$upperbound{$feature_name}, $class_name) -
1463							$self->probability_of_feature_less_than_threshold_given_class($feature_name,
1464	0					0	$lowerbound{$feature_name}, $class_name))
1465							}
1466							} elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1467	0	0				0	if (! $probability) {
1468							$probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1469	0					0	$upperbound{$feature_name}, $class_name);
1470							} else {
1471							$probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1472	0					0	$upperbound{$feature_name}, $class_name);
1473							}
1474							} elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1475	0	0				0	if (! $probability) {
1476							$probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1477	0					0	$lowerbound{$feature_name}, $class_name);
1478							} else {
1479							$probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1480	0					0	$lowerbound{$feature_name}, $class_name));
1481							}
1482							} else {
1483	0					0	die("Ill formatted call to 'probability of sequence given class' method");
1484							}
1485							}
1486	298					651	foreach my $feature_and_value (@symbolic_types) {
1487	882	50				5684	if ($feature_and_value =~ /$pattern1/) {
1488	882					3452	my ($feature,$value) = ($1,$2);
1489	882	100				2372	if (! $probability) {
1490	298					1090	$probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491							} else {
1492	584					1678	$probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1493							}
1494							}
1495							}
1496	298					1302	$self->{_probability_cache}->{$sequence_with_class} = $probability;
1497	298					1690	return $probability;
1498							}
1499
1500							sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1501	494			494	0	1036	my $self = shift;
1502	494					957	my $class_name = shift;
1503	494					889	my $arr = shift;
1504	494					1370	my @array_of_features_and_values_or_thresholds = @$arr;
1505	494					1349	my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1506	494					1298	my $class_and_sequence = "$class_name" . "::" . $sequence;
1507							return $self->{_probability_cache}->{$class_and_sequence}
1508	494	100				2370	if exists $self->{_probability_cache}->{$class_and_sequence};
1509	159					385	my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
	159					573
1510	159					391	foreach my $i (0..@{$self->{_class_names}}-1) {
	159					501
1511	318					761	my $class_name = $self->{_class_names}->[$i];
1512	318					1356	my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1513							\@array_of_features_and_values_or_thresholds, $class_name);
1514	318	50				1127	if ($prob < 0.000001) {
1515	0					0	$array_of_class_probabilities[$i] = 0.0;
1516	0					0	next;
1517							}
1518	318					1122	my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1519							\@array_of_features_and_values_or_thresholds);
1520							# die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1521							# "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1522							# if ! $prob_of_feature_sequence;
1523	318					1094	my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1524	318	50				822	if ($prob_of_feature_sequence) {
1525	318					1220	$array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1526							} else {
1527	0					0	$array_of_class_probabilities[$i] = $prior;
1528							}
1529							}
1530	159					363	my $sum_probability;
1531	159					433	map {$sum_probability += $_} @array_of_class_probabilities;
	318					931
1532	159	50				491	if ($sum_probability == 0) {
1533	0					0	@array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
	0					0
1534	0					0	(0..@{$self->{_class_names}}-1);
	0					0
1535							} else {
1536	159					392	@array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
	318					1035
1537							}
1538	159					376	foreach my $i (0..@{$self->{_class_names}}-1) {
	159					571
1539	318					1161	my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1540	318					1501	$self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1541							}
1542	159					636	return $self->{_probability_cache}->{$class_and_sequence};
1543							}
1544
1545							####################################### Class Based Utilities ##########################################
1546
1547							## Given a list of branch attributes for the numeric features of the form, say,
1548							## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1549							## smallest list that is relevant for the purpose of calculating the probabilities.
1550							## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1551							## time, less than 2, AND, at the same time, less than 3, is the same as the
1552							## probability that the feature less than 1. Similarly, the probability that 'age'
1553							## is greater than 34 and also greater than 37 is the same as `age' being greater
1554							## than 37.
1555							sub find_bounded_intervals_for_numeric_features {
1556	489			489	0	1104	my $self = shift;
1557	489					912	my $arr = shift;
1558	489					1061	my @arr = @$arr;
1559	489					917	my @features = @{$self->{_feature_names}};
	489					1651
1560	489					1170	my @arr1 = map {my @x = split /(>\|<)/, $_; \@x} @arr;
	0					0
	0					0
1561	489	50				1456	print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1562	489					1027	my @arr3 = ();
1563	489					1133	foreach my $feature_name (@features) {
1564	1956					3893	my @temp = ();
1565	1956					3831	foreach my $x (@arr1) {
1566	0	0	0			0	push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1567							}
1568	1956	50				5964	push @arr3, \@temp if @temp > 0;
1569							}
1570	489	50				1444	print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1571							# Sort each list so that '<' entries occur before '>' entries:
1572	489					915	my @arr4;
1573	489					992	foreach my $li (@arr3) {
1574	0					0	my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
	0					0
1575	0					0	push @arr4, \@sorted;
1576							}
1577	489	50				1387	print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1578	489					904	my @arr5;
1579	489					1030	foreach my $li (@arr4) {
1580	0					0	my @temp1 = ();
1581	0					0	my @temp2 = ();
1582	0					0	foreach my $inner (@$li) {
1583	0	0				0	if ($inner->[1] eq '<') {
1584	0					0	push @temp1, $inner;
1585							} else {
1586	0					0	push @temp2, $inner;
1587							}
1588							}
1589	0	0	0			0	if (@temp1 > 0 && @temp2 > 0) {
		0
1590	0					0	push @arr5, [\@temp1, \@temp2];
1591							} elsif (@temp1 > 0) {
1592	0					0	push @arr5, [\@temp1];
1593							} else {
1594	0					0	push @arr5, [\@temp2];
1595							}
1596							}
1597	489	50				1396	print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1598	489					1069	my @arr6 = ();
1599	489					1036	foreach my $li (@arr5) {
1600	0					0	my @temp1 = ();
1601	0					0	foreach my $inner (@$li) {
1602	0					0	my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
	0					0
1603	0					0	push @temp1, \@sorted;
1604							}
1605	0					0	push @arr6, \@temp1;
1606							}
1607	489	50				1301	print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1608	489					1034	my @arr9 = ();
1609	489					1042	foreach my $li (@arr6) {
1610	0					0	foreach my $alist (@$li) {
1611	0					0	my @newalist = ();
1612	0	0				0	if ($alist->[0][1] eq '<') {
1613	0					0	push @newalist, $alist->[0];
1614							} else {
1615	0					0	push @newalist, $alist->[-1];
1616							}
1617	0	0				0	if ($alist->[0][1] ne $alist->[-1][1]) {
1618	0					0	push @newalist, $alist->[-1];
1619							}
1620	0					0	push @arr9, \@newalist;
1621							}
1622							}
1623	489	50				1284	print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1624	489					1787	return \@arr9;
1625
1626							}
1627
1628							## This method is used to verify that you used legal feature names in the test
1629							## sample that you want to classify with the decision tree.
1630							sub check_names_used {
1631	0			0	0	0	my $self = shift;
1632	0					0	my $features_and_values_test_data = shift;
1633	0					0	my @features_and_values_test_data = @$features_and_values_test_data;
1634	0					0	my $pattern = '(\S+)\s=\s(\S+)';
1635	0					0	foreach my $feature_and_value (@features_and_values_test_data) {
1636	0					0	$feature_and_value =~ /$pattern/;
1637	0					0	my ($feature,$value) = ($1,$2);
1638	0	0	0			0	die "Your test data has formatting error" unless defined($feature) && defined($value);
1639	0	0				0	return 0 unless contained_in($feature, @{$self->{_feature_names}});
	0					0
1640							}
1641	0					0	return 1;
1642							}
1643
1644							####################################### Data Condition Calculator ######################################
1645
1646							## This method estimates the worst-case fan-out of the decision tree taking into
1647							## account the number of values (and therefore the number of branches emanating from
1648							## a node) for the symbolic features.
1649							sub determine_data_condition {
1650	0			0	0	0	my $self = shift;
1651	0					0	my $num_of_features = scalar @{$self->{_feature_names}};
	0					0
1652	0					0	my @values = ();
1653	0					0	my @number_of_values;
1654	0					0	foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
	0					0
1655	0					0	push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1656	0	0				0	if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
	0					0
1657	0					0	push @number_of_values, scalar @values;
1658							}
1659	0	0				0	return if ! @values;
1660	0					0	print "Number of features: $num_of_features\n";
1661	0					0	my @minmax = minmax(\@number_of_values);
1662	0					0	my $max_num_values = $minmax[1];
1663	0					0	print "Largest number of values for symbolic features is: $max_num_values\n";
1664	0					0	my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1665	0					0	print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1666							"nodes. The exact number of nodes created depends critically on " .
1667							"the entropy_threshold used for node expansion (the default value " .
1668							"for this threshold is 0.01) and on the value set for max_depth_desired " .
1669							"for the depth of the tree.\n";
1670	0	0				0	if ($estimated_number_of_nodes > 10000) {
1671	0					0	print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1672							"large value for entropy_threshold and/or a small value for " .
1673							"for max_depth_desired to reduce the number of nodes created.\n";
1674	0					0	print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1675	0					0	my $answer = ;
1676	0					0	$answer =~ s/\r?\n?$//;
1677	0		0			0	while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1678	0					0	print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1679	0					0	$answer = ;
1680	0					0	$answer =~ s/\r?\n?$//;
1681							}
1682	0	0				0	die unless $answer =~ /y(es)?/i;
1683							}
1684							}
1685
1686
1687							#################################### Read Training Data From File ######################################
1688
1689
1690							sub get_training_data {
1691	1			1	1	7	my $self = shift;
1692	1	50				10	die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1693	1					4	my %class_names = ();
1694	1					3	my %all_record_ids_with_class_labels;
1695							my $firstline;
1696	1					0	my %data_hash;
1697	1					4	$\|++;
1698	1		50			25	open FILEIN, $self->{_training_datafile} \|\| die "unable to open $self->{_training_datafile}: $!";
1699	1					4	my $record_index = 0;
1700	1					2	my $firsetline;
1701	1					16	while () {
1702	36	50				115	next if /^[ ]*\r?\n?$/;
1703	36					112	$_ =~ s/\r?\n?$//;
1704	36	50				84	my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
1705	36	100				77	if ($record_index == 0) {
1706	1					2	$firstline = $record;
1707	1					3	$record_index++;
1708	1					11	next;
1709							}
1710	35					152	my @parts = split /,/, $record;
1711	35					66	my $classname = $parts[$self->{_csv_class_column_index}];
1712	35					49	$class_names{$classname} = 1;
1713	35					56	my $record_label = shift @parts;
1714	35					81	$record_label =~ s/^\s\"\|\"\s$//g;
1715	35					77	$data_hash{$record_label} = \@parts;
1716	35					59	$all_record_ids_with_class_labels{$record_label} = $classname;
1717	35	50				76	print "." if $record_index % 10000 == 0;
1718	35					108	$record_index++;
1719							}
1720	1					9	close FILEIN;
1721	1					3	$\|--;
1722	1					5	$self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1723	1	50				4	print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1724	1					11	my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1725	1					4	my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1726	1					4	my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
	2					12
1727	1					3	my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
	4					9
	1					3
1728	1					5	my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
	35					96
1729	1					10	my @sample_names = map {"sample_$_"} keys %data_hash;
	35					59
1730	1	50				8	my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
	35					59
	35					55
	140					220
	140					223
	140					516
	35					58
1731	1	50				10	my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
	4					15
	4					11
	4					31
	140					300
	140					518
	140					387
	1					5
1732	1					4	my %numeric_features_valuerange_hash = ();
1733	1					3	my %feature_values_how_many_uniques_hash = ();
1734	1					4	my %features_and_unique_values_hash = ();
1735	1					3	my $numregex = '[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?';
1736	1					5	foreach my $feature (keys %features_and_values_hash) {
1737	4					9	my %seen1 = ();
1738	140	100	66			655	my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1739	4					8	@{$features_and_values_hash{$feature}};
	4					16
1740	4					15	$feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1741	4					11	my $not_all_values_float = 0;
1742	4	50				11	map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
	14					152
1743	4	50				13	if ($not_all_values_float == 0) {
1744	0					0	my @minmaxvalues = minmax(\@unique_values_for_feature);
1745	0					0	$numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1746							}
1747	4					17	$features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1748							}
1749	1	50				7	if ($self->{_debug1}) {
1750	0					0	print "\nAll class names: @all_class_names\n";
1751	0					0	print "\nEach sample data record:\n";
1752	0					0	foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
	0					0
1753	0					0	print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
	0					0
1754							}
1755	0					0	print "\nclass label for each data sample:\n";
1756	0					0	foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
	0					0
1757	0					0	print "$sample => $class_for_sample_hash{$sample}\n";
1758							}
1759	0					0	print "\nFeatures used: @feature_names\n\n";
1760	0					0	print "\nfeatures and the values taken by them:\n";
1761	0					0	foreach my $feature (sort keys %features_and_values_hash) {
1762	0					0	print "$feature => @{$features_and_values_hash{$feature}}\n";
	0					0
1763							}
1764	0					0	print "\nnumeric features and their ranges:\n";
1765	0					0	foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1766	0					0	print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
	0					0
1767							}
1768	0					0	print "\nnumber of unique values in each feature:\n";
1769	0					0	foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1770	0					0	print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1771							}
1772							}
1773	1					4	$self->{_class_names} = \@all_class_names;
1774	1					4	$self->{_feature_names} = \@feature_names;
1775	1					4	$self->{_samples_class_label_hash} = \%class_for_sample_hash;
1776	1					3	$self->{_training_data_hash} = \%feature_values_for_samples_hash;
1777	1					4	$self->{_features_and_values_hash} = \%features_and_values_hash;
1778	1					12	$self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1779	1					3	$self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1780	1					44	$self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1781							}
1782
1783							sub show_training_data {
1784	0			0	1	0	my $self = shift;
1785	0					0	my @class_names = @{$self->{_class_names}};
	0					0
1786	0					0	my %features_and_values_hash = %{$self->{_features_and_values_hash}};
	0					0
1787	0					0	my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
	0					0
1788	0					0	my %training_data_hash = %{$self->{_training_data_hash}};
	0					0
1789	0					0	print "\n\nClass Names: @class_names\n";
1790	0					0	print "\n\nFeatures and Their Values:\n\n";
1791	0					0	while ( my ($k, $v) = each %features_and_values_hash ) {
1792	0					0	print "$k ---> @{$features_and_values_hash{$k}}\n";
	0					0
1793							}
1794	0					0	print "\n\nSamples vs. Class Labels:\n\n";
1795	0					0	foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
	0					0
1796	0					0	print "$kee => $samples_class_label_hash{$kee}\n";
1797							}
1798	0					0	print "\n\nTraining Samples:\n\n";
1799	0					0	foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
	0					0
1800							keys %training_data_hash) {
1801	0					0	print "$kee => @{$training_data_hash{$kee}}\n";
	0					0
1802							}
1803							}
1804
1805							sub get_class_names {
1806	0			0	0	0	my $self = shift;
1807	0					0	return @{$self->{_class_names}}
	0					0
1808							}
1809
1810							########################################## Utility Routines ############################################
1811
1812							sub closest_sampling_point {
1813	0			0	0	0	my $value = shift;
1814	0					0	my $arr_ref = shift;
1815	0					0	my @arr = @{$arr_ref};
	0					0
1816	0					0	my @compare = map {abs($_ - $value)} @arr;
	0					0
1817	0					0	my ($minval,$index) = minimum(\@compare);
1818	0					0	return $arr[$index];
1819							}
1820
1821							## returns the array index that contains a specified STRING value: (meant only for array of strings)
1822							sub get_index_at_value {
1823	0			0	0	0	my $value = shift;
1824	0					0	my @array = @{shift @_};
	0					0
1825	0					0	foreach my $i (0..@array-1) {
1826	0	0				0	return $i if $value eq $array[$i];
1827							}
1828							}
1829
1830							## When the training data is read from a CSV file, we assume that the first column
1831							## of each data record contains a unique integer identifier for the record in that
1832							## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1833							## followed by the identifying integers. The purpose of this function is to return
1834							## the identifying integer associated with a data record.
1835							sub sample_index {
1836	1088			1088	0	1757	my $arg = shift;
1837	1088					2447	$arg =~ /_(.+)$/;
1838	1088					2685	return $1;
1839							}
1840
1841							# Returns the minimum value and its positional index in an array
1842							sub minimum {
1843	0			0	0	0	my $arr = shift;
1844	0					0	my $min;
1845							my $index;
1846	0					0	foreach my $i (0..@{$arr}-1) {
	0					0
1847	0	0	0			0	if ( (!defined $min) \|\| ($arr->[$i] < $min) ) {
1848	0					0	$index = $i;
1849	0					0	$min = $arr->[$i];
1850							}
1851							}
1852	0					0	return ($min, $index);
1853							}
1854
1855							# Returns an array of two values, the min and the max, of an array of floats
1856							sub minmax {
1857	0			0	0	0	my $arr = shift;
1858	0					0	my ($min, $max);
1859	0					0	foreach my $i (0..@{$arr}-1) {
	0					0
1860	0	0	0			0	if ( (!defined $min) \|\| ($arr->[$i] < $min) ) {
1861	0					0	$min = $arr->[$i];
1862							}
1863	0	0	0			0	if ( (!defined $max) \|\| ($arr->[$i] > $max) ) {
1864	0					0	$max = $arr->[$i];
1865							}
1866							}
1867	0					0	return ($min, $max);
1868							}
1869
1870							# checks whether an element is in an array:
1871							sub contained_in {
1872	414			414	0	858	my $ele = shift;
1873	414					1227	my @array = @_;
1874	414					858	my $count = 0;
1875	414	100				908	map {$count++ if $ele eq $_} @array;
	1104					3997
1876	414					1727	return $count;
1877							}
1878
1879							# Meant only for an array of strings (no nesting):
1880							sub deep_copy_array {
1881	289			289	0	619	my $ref_in = shift;
1882	289					578	my $ref_out;
1883	289	100				885	return [] if scalar @$ref_in == 0;
1884	274					525	foreach my $i (0..@{$ref_in}-1) {
	274					994
1885	628					1783	$ref_out->[$i] = $ref_in->[$i];
1886							}
1887	274					986	return $ref_out;
1888							}
1889
1890							sub check_for_illegal_params2 {
1891	1			1	0	3	my @params = @_;
1892	1					7	my @legal_params = qw / training_datafile
1893							entropy_threshold
1894							max_depth_desired
1895							csv_class_column_index
1896							csv_columns_for_features
1897							symbolic_to_numeric_cardinality_threshold
1898							number_of_histogram_bins
1899							csv_cleanup_needed
1900							debug1
1901							debug2
1902							debug3
1903							/;
1904	1					7	my $found_match_flag;
1905	1					3	foreach my $param (@params) {
1906	3					5	foreach my $legal (@legal_params) {
1907	10					16	$found_match_flag = 0;
1908	10	100				20	if ($param eq $legal) {
1909	3					4	$found_match_flag = 1;
1910	3					5	last;
1911							}
1912							}
1913	3	50				9	last if $found_match_flag == 0;
1914							}
1915	1					9	return $found_match_flag;
1916							}
1917
1918							sub print_array_with_msg {
1919	0			0	0	0	my $message = shift;
1920	0					0	my $arr = shift;
1921	0					0	print "\n$message: ";
1922	0					0	print_nested_array( $arr );
1923							}
1924
1925							sub print_nested_array {
1926	0			0	0	0	my $arr = shift;
1927	0					0	my @arr = @$arr;
1928	0					0	print "[";
1929	0					0	foreach my $item (@arr) {
1930	0	0				0	if (ref $item) {
1931	0					0	print_nested_array($item);
1932							} else {
1933	0					0	print "$item";
1934							}
1935							}
1936	0					0	print "]";
1937							}
1938
1939							sub cleanup_csv {
1940	0			0	0	0	my $line = shift;
1941	0					0	$line =~ tr/\/:?()[]{}'/ /;
1942							# my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1943	0					0	my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
1944	0					0	for (@double_quoted) {
1945	0					0	my $item = $_;
1946	0					0	$item = substr($item, 1, -1);
1947	0					0	$item =~ s/^\s+\|,\|\s+$//g;
1948	0					0	$item = join '_', split /\s+/, $item;
1949	0					0	substr($line, index($line, $_), length($_)) = $item;
1950							}
1951	0					0	my @white_spaced = $line =~ /,(\s*[^,]+)(?=,\|$)/g;
1952	0					0	for (@white_spaced) {
1953	0					0	my $item = $_;
1954	0					0	$item =~ s/\s+/_/g;
1955	0					0	$item =~ s/^\s_\|_\s$//g;
1956	0					0	substr($line, index($line, $_), length($_)) = $item;
1957							}
1958	0					0	$line =~ s/,\s*(?=,\|$)/,NA/g;
1959	0					0	return $line;
1960							}
1961
1962							######################################### Class EvalTrainingData ########################################
1963
1964							## This subclass of the DecisionTree class is used to evaluate the quality of your
1965							## training data by running a 10-fold cross-validation test on it. This test divides
1966							## all of the training data into ten parts, with nine parts used for training a
1967							## decision tree and one part used for testing its ability to classify correctly.
1968							## This selection of nine parts for training and one part for testing is carried out
1969							## in all of the ten different possible ways. This testing functionality can also
1970							## be used to find the best values to use for the constructor parameters
1971							## entropy_threshold, max_depth_desired, and
1972							## symbolic_to_numeric_cardinality_threshold.
1973
1974							## Only the CSV training files can be evaluated in this manner (because only CSV
1975							## training are allowed to have numeric features --- which is the more interesting
1976							## case for evaluation analytics.
1977
1978							package EvalTrainingData;
1979
1980							@EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1981
1982							sub new {
1983	0			0		0	my $class = shift;
1984	0					0	my $instance = Algorithm::DecisionTree->new(@_);
1985	0					0	bless $instance, $class;
1986							}
1987
1988							sub evaluate_training_data {
1989	0			0		0	my $self = shift;
1990	0					0	my $evaldebug = 0;
1991							die "The data evaluation function in the module can only be used when your " .
1992	0	0				0	"training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1993	0					0	print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1994							"class-discriminatory power:\n";
1995	0					0	my %all_training_data = %{$self->{_training_data_hash}};
	0					0
1996	0					0	my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
	0					0
1997							Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
1998	0					0	my $fold_size = int(0.1 * (scalar keys %all_training_data));
1999	0					0	print "fold size: $fold_size\n";
2000	0					0	my %confusion_matrix = ();
2001	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
2002	0					0	foreach my $inner_class_name (@{$self->{_class_names}}) {
	0					0
2003	0					0	$confusion_matrix{$class_name}->{$inner_class_name} = 0;
2004							}
2005							}
2006	0					0	foreach my $fold_index (0..9) {
2007	0					0	print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2008	0					0	my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2009	0					0	my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2010							@all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2011	0					0	my %testing_data = ();
2012	0					0	foreach my $x (@testing_samples) {
2013	0					0	$testing_data{$x} = $all_training_data{$x};
2014							}
2015	0					0	my %training_data = ();
2016	0					0	foreach my $x (@training_samples) {
2017	0					0	$training_data{$x} = $all_training_data{$x};
2018							}
2019	0					0	my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2020	0					0	$trainingDT->{_training_data_hash} = \%training_data;
2021	0					0	$trainingDT->{_class_names} = $self->{_class_names};
2022	0					0	$trainingDT->{_feature_names} = $self->{_feature_names};
2023	0					0	$trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2024	0					0	$trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2025							$trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2026	0					0	$self->{_symbolic_to_numeric_cardinality_threshold};
2027	0					0	foreach my $sample_name (@training_samples) {
2028							$trainingDT->{_samples_class_label_hash}->{$sample_name} =
2029	0					0	$self->{_samples_class_label_hash}->{$sample_name};
2030							}
2031	0					0	foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
	0					0
2032	0					0	$trainingDT->{_features_and_values_hash}->{$feature} = ();
2033							}
2034	0					0	my $pattern = '(\S+)\s=\s(\S+)';
2035	0					0	foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
	0					0
2036							Algorithm::DecisionTree::sample_index($b)}
2037	0					0	keys %{$trainingDT->{_training_data_hash}}) {
2038	0					0	foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
	0					0
2039	0					0	$feature_and_value =~ /$pattern/;
2040	0					0	my ($feature,$value) = ($1,$2);
2041	0	0				0	push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
	0					0
2042							}
2043							}
2044	0					0	foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
	0					0
2045	0					0	my %seen = ();
2046	0	0	0			0	my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2047	0					0	@{$trainingDT->{_features_and_values_hash}->{$feature}};
	0					0
2048	0	0				0	if (Algorithm::DecisionTree::contained_in($feature,
2049	0					0	keys %{$self->{_numeric_features_valuerange_hash}})) {
2050	0					0	@unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
	0					0
2051							} else {
2052	0					0	@unique_values_for_feature = sort @unique_values_for_feature;
2053							}
2054	0					0	$trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2055							}
2056	0					0	foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
	0					0
2057							my @minmaxvalues = Algorithm::DecisionTree::minmax(
2058	0					0	\@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
	0					0
2059	0					0	$trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2060							}
2061	0	0				0	if ($evaldebug) {
2062	0					0	print "\n\nprinting samples in the testing set: @testing_samples\n";
2063	0					0	print "\n\nPrinting features and their values in the training set:\n";
2064	0					0	foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
	0					0
2065	0					0	print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
	0					0
2066							}
2067	0					0	print "\n\nPrinting unique values for features:\n";
2068	0					0	foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
	0					0
2069	0					0	print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
	0					0
2070							}
2071	0					0	print "\n\nPrinting unique value ranges for features:\n";
2072	0					0	foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
	0					0
2073	0					0	print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
	0					0
2074							}
2075							}
2076	0					0	foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
	0					0
2077							$trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2078	0					0	scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
	0					0
2079							}
2080	0	0				0	$trainingDT->{_debug2} = 1 if $evaldebug;
2081	0					0	$trainingDT->calculate_first_order_probabilities();
2082	0					0	$trainingDT->calculate_class_priors();
2083	0					0	my $root_node = $trainingDT->construct_decision_tree_classifier();
2084	0	0				0	$root_node->display_decision_tree(" ") if $evaldebug;
2085	0					0	foreach my $test_sample_name (@testing_samples) {
2086	0					0	my @test_sample_data = @{$all_training_data{$test_sample_name}};
	0					0
2087	0	0				0	print "original data in test sample: @test_sample_data\n" if $evaldebug;
2088	0	0	0			0	@test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
	0					0
2089	0	0				0	print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2090	0					0	my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
	0					0
2091	0					0	my @solution_path = @{$classification{'solution_path'}};
	0					0
2092	0					0	delete $classification{'solution_path'};
2093	0					0	my @which_classes = keys %classification;
2094	0					0	@which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
	0					0
2095	0					0	my $most_likely_class_label = $which_classes[0];
2096	0	0				0	if ($evaldebug) {
2097	0					0	print "\nClassification:\n\n";
2098	0					0	print " class probability\n";
2099	0					0	print " ---------- -----------\n";
2100	0					0	foreach my $which_class (@which_classes) {
2101	0					0	my $classstring = sprintf("%-30s", $which_class);
2102	0					0	my $valuestring = sprintf("%-30s", $classification{$which_class});
2103	0					0	print " $classstring $valuestring\n";
2104							}
2105	0					0	print "\nSolution path in the decision tree: @solution_path\n";
2106	0					0	print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2107							}
2108	0					0	my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2109	0	0				0	print "$test_sample_name: true_class: $true_class_label_for_sample " .
2110							"estimated_class: $most_likely_class_label\n" if $evaldebug;
2111	0					0	$confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2112							}
2113							}
2114	0					0	print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2115	0					0	my $matrix_header = " " x 30;
2116	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
2117	0					0	$matrix_header .= sprintf("%-30s", $class_name);
2118							}
2119	0					0	print "\n" . $matrix_header . "\n\n";
2120	0					0	foreach my $row_class_name (sort keys %confusion_matrix) {
2121	0					0	my $row_display = sprintf("%-30s", $row_class_name);
2122	0					0	foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
	0					0
2123	0					0	$row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2124							}
2125	0					0	print "$row_display\n\n";
2126							}
2127	0					0	print "\n\n";
2128	0					0	my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2129	0					0	foreach my $row_class_name (sort keys %confusion_matrix) {
2130	0					0	foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
	0					0
2131	0	0				0	if ($row_class_name eq $col_class_name) {
2132	0					0	$diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2133							} else {
2134	0					0	$off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2135							}
2136							}
2137							}
2138	0					0	my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2139	0					0	print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2140	0	0	0			0	if ($data_quality_index <= 80) {
		0	0
		0	0
		0
2141	0					0	print "\nYour training data does not possess much class discriminatory " .
2142							"information. It could be that the classes are inherently not well " .
2143							"separable or that your constructor parameter choices are not appropriate.\n";
2144							} elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2145	0					0	print "\nYour training data possesses some class discriminatory information " .
2146							"but it may not be sufficient for real-world applications. You might " .
2147							"try tweaking the constructor parameters to see if that improves the " .
2148							"class discriminations.\n";
2149							} elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2150	0					0	print "\nYour training data appears to possess good class discriminatory " .
2151							"information. Whether or not it is acceptable would depend on your " .
2152							"application.\n";
2153							} elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2154	0					0	print "\nYour training data is of excellent quality.\n";
2155							} else {
2156	0					0	print "\nYour training data is perfect.\n";
2157							}
2158
2159							}
2160
2161
2162							############################################# Class DTNode #############################################
2163
2164							# The nodes of the decision tree are instances of this class:
2165
2166							package DTNode;
2167
2168	1			1		23	use strict;
	1					5
	1					39
2169	1			1		14	use Carp;
	1					4
	1					1578
2170
2171							# $feature is the feature test at the current node. $branch_features_and_values is
2172							# an anonymous array holding the feature names and corresponding values on the path
2173							# from the root to the current node:
2174							sub new {
2175	46			46		235	my ($class, $feature, $entropy, $class_probabilities,
2176							$branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2177	46	100				229	$root_or_not = '' if !defined $root_or_not;
2178	46	100				187	if ($root_or_not eq 'root') {
2179	1					5	$dt->{nodes_created} = -1;
2180	1					8	$dt->{class_names} = undef;
2181							}
2182	46					400	my $self = {
2183							_dt => $dt,
2184							_feature => $feature,
2185							_node_creation_entropy => $entropy,
2186							_class_probabilities => $class_probabilities,
2187							_branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2188							_linked_to => [],
2189							};
2190	46					153	bless $self, $class;
2191	46					199	$self->{_serial_number} = $self->get_next_serial_num();
2192	46					136	return $self;
2193							}
2194
2195							sub how_many_nodes {
2196	0			0		0	my $self = shift;
2197	0					0	return $self->{_dt}->{nodes_created} + 1;
2198							}
2199
2200							sub set_class_names {
2201	1			1		5	my $self = shift;
2202	1					4	my $class_names_list = shift;
2203	1					4	$self->{_dt}->{class_names} = $class_names_list;
2204							}
2205
2206							sub get_class_names {
2207	0			0		0	my $self = shift;
2208	0					0	return $self->{_dt}->{class_names};
2209							}
2210
2211							sub get_next_serial_num {
2212	46			46		110	my $self = shift;
2213	46					158	$self->{_dt}->{nodes_created} += 1;
2214	46					154	return $self->{_dt}->{nodes_created};
2215							}
2216
2217							sub get_serial_num {
2218	46			46		115	my $self = shift;
2219	46					135	$self->{_serial_number};
2220							}
2221
2222							# this returns the feature test at the current node
2223							sub get_feature {
2224	0			0		0	my $self = shift;
2225	0					0	return $self->{ _feature };
2226							}
2227
2228							sub set_feature {
2229	46			46		114	my $self = shift;
2230	46					105	my $feature = shift;
2231	46					168	$self->{_feature} = $feature;
2232							}
2233
2234							sub get_node_entropy {
2235	46			46		103	my $self = shift;
2236	46					132	return $self->{_node_creation_entropy};
2237							}
2238
2239							sub get_class_probabilities {
2240	0			0		0	my $self = shift;
2241	0					0	return $self->{ _class_probabilities};
2242							}
2243
2244							sub get_branch_features_and_values_or_thresholds {
2245	46			46		117	my $self = shift;
2246	46					178	return $self->{_branch_features_and_values_or_thresholds};
2247							}
2248
2249							sub add_to_branch_features_and_values {
2250	0			0		0	my $self = shift;
2251	0					0	my $feature_and_value = shift;
2252	0					0	push @{$self->{ _branch_features_and_values }}, $feature_and_value;
	0					0
2253							}
2254
2255							sub get_children {
2256	0			0		0	my $self = shift;
2257	0					0	return $self->{_linked_to};
2258							}
2259
2260							sub add_child_link {
2261	45			45		136	my ($self, $new_node, ) = @_;
2262	45					103	push @{$self->{_linked_to}}, $new_node;
	45					151
2263							}
2264
2265							sub delete_all_links {
2266	0			0		0	my $self = shift;
2267	0					0	$self->{_linked_to} = undef;
2268							}
2269
2270							sub display_node {
2271	0			0		0	my $self = shift;
2272	0		0			0	my $feature_at_node = $self->get_feature() \|\| " ";
2273	0					0	my $node_creation_entropy_at_node = $self->get_node_entropy();
2274	0					0	my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2275	0					0	my @class_probabilities = @{$self->get_class_probabilities()};
	0					0
2276	0					0	my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
	0					0
2277	0					0	my $serial_num = $self->get_serial_num();
2278	0					0	my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
	0					0
2279	0					0	print "\n\nNODE $serial_num" .
2280							":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2281							"\n Class probabilities at current node: @class_probabilities_for_display" .
2282							"\n Entropy at current node: $print_node_creation_entropy_at_node" .
2283							"\n Best feature test at current node: $feature_at_node\n\n";
2284							}
2285
2286							sub display_decision_tree {
2287	0			0		0	my $self = shift;
2288	0					0	my $offset = shift;
2289	0					0	my $serial_num = $self->get_serial_num();
2290	0	0				0	if (@{$self->get_children()} > 0) {
	0					0
2291	0		0			0	my $feature_at_node = $self->get_feature() \|\| " ";
2292	0					0	my $node_creation_entropy_at_node = $self->get_node_entropy();
2293	0					0	my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2294	0					0	my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
	0					0
2295	0					0	my @class_probabilities = @{$self->get_class_probabilities()};
	0					0
2296	0					0	my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
	0					0
2297	0					0	my @class_names = @{$self->get_class_names()};
	0					0
2298							my @print_class_probabilities_with_class =
2299	0					0	map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
	0					0
2300	0					0	print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2301	0					0	my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2302	0					0	print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2303							"$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2304	0					0	$offset .= " ";
2305	0					0	foreach my $child (@{$self->get_children()}) {
	0					0
2306	0					0	$child->display_decision_tree($offset);
2307							}
2308							} else {
2309	0					0	my $node_creation_entropy_at_node = $self->get_node_entropy();
2310	0					0	my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2311	0					0	my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
	0					0
2312	0					0	my @class_probabilities = @{$self->get_class_probabilities()};
	0					0
2313	0					0	my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
	0					0
2314	0					0	my @class_names = @{$self->get_class_names()};
	0					0
2315							my @print_class_probabilities_with_class =
2316	0					0	map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
	0					0
2317	0					0	print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2318	0					0	my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2319	0					0	print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2320							"Class Probs: @print_class_probabilities_with_class\n\n";
2321							}
2322							}
2323
2324
2325							############################## Generate Your Own Numeric Training Data #################################
2326							############################# Class TrainingDataGeneratorNumeric ################################
2327
2328							## See the script generate_training_data_numeric.pl in the examples
2329							## directory on how to use this class for generating your own numeric training and
2330							## test data. The training and test data are generated in accordance with the
2331							## specifications you place in the parameter file that is supplied as an argument to
2332							## the constructor of this class.
2333
2334							package TrainingDataGeneratorNumeric;
2335
2336	1			1		20	use strict;
	1					3
	1					36
2337	1			1		8	use Carp;
	1					4
	1					1485
2338
2339							sub new {
2340	0			0		0	my ($class, %args) = @_;
2341	0					0	my @params = keys %args;
2342	0	0				0	croak "\nYou have used a wrong name for a keyword argument " .
2343							"--- perhaps a misspelling\n"
2344							if check_for_illegal_params3(@params) == 0;
2345							bless {
2346							_output_training_csv_file => $args{'output_training_csv_file'}
2347							\|\| croak("name for output_training_csv_file required"),
2348							_output_test_csv_file => $args{'output_test_csv_file'}
2349							\|\| croak("name for output_test_csv_file required"),
2350							_parameter_file => $args{'parameter_file'}
2351							\|\| croak("parameter_file required"),
2352							_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2353							\|\| croak("number_of_samples_for_training"),
2354							_number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2355							\|\| croak("number_of_samples_for_testing"),
2356	0		0			0	_debug => $args{debug} \|\| 0,
			0
			0
			0
			0
			0
2357							_class_names => [],
2358							_class_names_and_priors => {},
2359							_features_with_value_range => {},
2360							_features_ordered => [],
2361							_classes_and_their_param_values => {},
2362							}, $class;
2363							}
2364
2365							sub check_for_illegal_params3 {
2366	0			0		0	my @params = @_;
2367	0					0	my @legal_params = qw / output_training_csv_file
2368							output_test_csv_file
2369							parameter_file
2370							number_of_samples_for_training
2371							number_of_samples_for_testing
2372							debug
2373							/;
2374	0					0	my $found_match_flag;
2375	0					0	foreach my $param (@params) {
2376	0					0	foreach my $legal (@legal_params) {
2377	0					0	$found_match_flag = 0;
2378	0	0				0	if ($param eq $legal) {
2379	0					0	$found_match_flag = 1;
2380	0					0	last;
2381							}
2382							}
2383	0	0				0	last if $found_match_flag == 0;
2384							}
2385	0					0	return $found_match_flag;
2386							}
2387
2388							## The training data generated by an instance of the class
2389							## TrainingDataGeneratorNumeric is based on the specs you place in a parameter that
2390							## you supply to the class constructor through a constructor variable called
2391							## `parameter_file'. This method is for parsing the parameter file in order to
2392							## order to determine the names to be used for the different data classes, their
2393							## means, and their variances.
2394							sub read_parameter_file_numeric {
2395	0			0		0	my $self = shift;
2396	0					0	my @class_names = ();
2397	0					0	my %class_names_and_priors = ();
2398	0					0	my %features_with_value_range = ();
2399	0					0	my %classes_and_their_param_values = ();
2400							# my $regex8 = '[+-]?\ (\d+(\.\d)?\|\.\d+)([eE][+-]?\d+)?';
2401	0		0			0	open FILE, $self->{_parameter_file} \|\| die "unable to open parameter file: $!";
2402	0					0	my @params = ;
2403	0					0	my $params = join "", @params;
2404	0					0	my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2405	0					0	$params =~ /$regex/si;
2406	0					0	my ($class_names, $class_priors) = ($1, $2);
2407	0					0	@class_names = split ' ', $class_names;
2408	0					0	my @class_priors = split ' ', $class_priors;
2409	0					0	foreach my $i (0..@class_names-1) {
2410	0					0	$class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2411							}
2412	0	0				0	if ($self->{_debug}) {
2413	0					0	foreach my $cname (keys %class_names_and_priors) {
2414	0					0	print "$cname => $class_names_and_priors{$cname}\n";
2415							}
2416							}
2417	0					0	$regex = 'feature name: \w.?value range: [\d\. -]+';
2418	0					0	my @features = $params =~ /$regex/gsi;
2419	0					0	my @features_ordered;
2420	0					0	$regex = 'feature name: (\w+)\W?value range:\s([\d. -]+)';
2421	0					0	foreach my $feature (@features) {
2422	0					0	$feature =~ /$regex/i;
2423	0					0	my $feature_name = $1;
2424	0					0	push @features_ordered, $feature_name;
2425	0					0	my @value_range = split ' ', $2;
2426	0					0	$features_with_value_range{$feature_name} = \@value_range;
2427							}
2428	0	0				0	if ($self->{_debug}) {
2429	0					0	foreach my $fname (keys %features_with_value_range) {
2430	0					0	print "$fname => @{$features_with_value_range{$fname}}\n";
	0					0
2431							}
2432							}
2433	0					0	foreach my $i (0..@class_names-1) {
2434	0					0	$classes_and_their_param_values{$class_names[$i]} = {};
2435							}
2436	0					0	$regex = 'params for class: \w?\W+?mean:[\d\. ]+\W?covariance:\W+?(?:[ \d.]+\W+?)+';
2437	0					0	my @class_params = $params =~ /$regex/gsi;
2438	0					0	$regex = 'params for class: (\w+)\W?mean:\s([\d. -]+)\Wcovariance:\s([\s\d.]+)';
2439	0					0	foreach my $class_param (@class_params) {
2440	0					0	$class_param =~ /$regex/gsi;
2441	0					0	my $class_name = $1;
2442	0					0	my @class_mean = split ' ', $2;
2443	0					0	$classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2444	0					0	my $class_param_string = $3;
2445	0					0	my @covar_rows = split '\n', $class_param_string;
2446	0					0	my @covar_matrix;
2447	0					0	foreach my $row (@covar_rows) {
2448	0					0	my @row = split ' ', $row;
2449	0					0	push @covar_matrix, \@row;
2450							}
2451	0					0	$classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2452							}
2453	0	0				0	if ($self->{_debug}) {
2454	0					0	print "\nThe class parameters are:\n\n";
2455	0					0	foreach my $cname (keys %classes_and_their_param_values) {
2456	0					0	print "\nFor class name $cname:\n";
2457	0					0	my %params_hash = %{$classes_and_their_param_values{$cname}};
	0					0
2458	0					0	foreach my $x (keys %params_hash) {
2459	0	0				0	if ($x eq 'mean') {
2460	0					0	print " $x => @{$params_hash{$x}}\n";
	0					0
2461							} else {
2462	0	0				0	if ($x eq 'covariance') {
2463	0					0	print " The covariance matrix:\n";
2464	0					0	my @matrix = @{$params_hash{'covariance'}};
	0					0
2465	0					0	foreach my $row (@matrix) {
2466	0					0	print " @$row\n";
2467							}
2468							}
2469							}
2470							}
2471							}
2472							}
2473	0					0	$self->{_class_names} = \@class_names;
2474	0					0	$self->{_class_names_and_priors} = \%class_names_and_priors;
2475	0					0	$self->{_features_with_value_range} = \%features_with_value_range;
2476	0					0	$self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2477	0					0	$self->{_features_ordered} = \@features_ordered;
2478							}
2479
2480							## After the parameter file is parsed by the previous method, this method calls on
2481							## Math::Random::random_multivariate_normal() to generate the training and test data
2482							## samples. Your training and test data can be of any number of of dimensions, can
2483							## have any mean, and any covariance. The training and test data must obviously be
2484							## drawn from the same distribution.
2485							sub gen_numeric_training_and_test_data_and_write_to_csv {
2486	1			1		864	use Math::Random;
	1					8174
	1					1453
2487	0			0		0	my $self = shift;
2488	0					0	my %training_samples_for_class;
2489							my %test_samples_for_class;
2490	0					0	foreach my $class_name (@{$self->{_class_names}}) {
	0					0
2491	0					0	$training_samples_for_class{$class_name} = [];
2492	0					0	$test_samples_for_class{$class_name} = [];
2493							}
2494	0					0	foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
	0					0
2495	0					0	my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
	0					0
2496	0					0	my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
	0					0
2497							my @new_training_data = Math::Random::random_multivariate_normal(
2498	0					0	$self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2499							@mean, @covariance );
2500							my @new_test_data = Math::Random::random_multivariate_normal(
2501	0					0	$self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2502							@mean, @covariance );
2503	0	0				0	if ($self->{_debug}) {
2504	0					0	print "training data for class $class_name:\n";
2505	0					0	foreach my $x (@new_training_data) {print "@$x\n";}
	0					0
2506	0					0	print "\n\ntest data for class $class_name:\n";
2507	0					0	foreach my $x (@new_test_data) {print "@$x\n";}
	0					0
2508							}
2509	0					0	$training_samples_for_class{$class_name} = \@new_training_data;
2510	0					0	$test_samples_for_class{$class_name} = \@new_test_data;
2511							}
2512	0					0	my @training_data_records = ();
2513	0					0	my @test_data_records = ();
2514	0					0	foreach my $class_name (keys %training_samples_for_class) {
2515							my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2516	0					0	$self->{_class_names_and_priors}->{$class_name};
2517							my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2518	0					0	$self->{_class_names_and_priors}->{$class_name};
2519	0					0	foreach my $sample_index (0..$num_of_samples_for_training-1) {
2520	0					0	my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
	0					0
2521	0					0	@training_vector = map {sprintf("%.3f", $_)} @training_vector;
	0					0
2522	0					0	my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2523	0					0	push @training_data_records, $training_data_record;
2524							}
2525	0					0	foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2526	0					0	my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
	0					0
2527	0					0	@test_vector = map {sprintf("%.3f", $_)} @test_vector;
	0					0
2528	0					0	my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2529	0					0	push @test_data_records, $test_data_record;
2530							}
2531							}
2532	0					0	fisher_yates_shuffle(\@training_data_records);
2533	0					0	fisher_yates_shuffle(\@test_data_records);
2534	0	0				0	if ($self->{_debug}) {
2535	0					0	foreach my $record (@training_data_records) {
2536	0					0	print "$record";
2537							}
2538	0					0	foreach my $record (@test_data_records) {
2539	0					0	print "$record";
2540							}
2541							}
2542	0					0	open OUTPUT, ">$self->{_output_training_csv_file}";
2543	0					0	my @feature_names_training = @{$self->{_features_ordered}};
	0					0
2544	0					0	my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
	0					0
2545	0					0	my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2546	0					0	print OUTPUT "$first_row_training\n";
2547	0					0	foreach my $i (0..@training_data_records-1) {
2548	0					0	my $i1 = $i+1;
2549	0					0	my $sample_record = "\"$i1\",$training_data_records[$i]";
2550	0					0	print OUTPUT "$sample_record";
2551							}
2552	0					0	close OUTPUT;
2553	0					0	open OUTPUT, ">$self->{_output_test_csv_file}";
2554	0					0	my @feature_names_testing = keys %{$self->{_features_with_value_range}};
	0					0
2555	0					0	my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
	0					0
2556	0					0	my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2557	0					0	print OUTPUT "$first_row_testing\n";
2558	0					0	foreach my $i (0..@test_data_records-1) {
2559	0					0	my $i1 = $i+1;
2560	0					0	my $sample_record = "\"$i1\",$test_data_records[$i]";
2561	0					0	print OUTPUT "$sample_record";
2562							}
2563	0					0	close OUTPUT;
2564							}
2565
2566							# from perl docs:
2567							sub fisher_yates_shuffle {
2568	0			0		0	my $arr = shift;
2569	0					0	my $i = @$arr;
2570	0					0	while (--$i) {
2571	0					0	my $j = int rand( $i + 1 );
2572	0					0	@$arr[$i, $j] = @$arr[$j, $i];
2573							}
2574							}
2575
2576							########################### Generate Your Own Symbolic Training Data ###############################
2577							########################### Class TrainingDataGeneratorSymbolic #############################
2578
2579							## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2580							## this class for generating purely symbolic training and test data. The data is
2581							## generated according to the specifications you place in a parameter file whose
2582							## name you supply as one of constructor arguments.
2583							package TrainingDataGeneratorSymbolic;
2584
2585	1			1		19	use strict;
	1					5
	1					39
2586	1			1		9	use Carp;
	1					3
	1					3354
2587
2588							sub new {
2589	1			1		32	my ($class, %args) = @_;
2590	1					7	my @params = keys %args;
2591	1	50				8	croak "\nYou have used a wrong name for a keyword argument " .
2592							"--- perhaps a misspelling\n"
2593							if check_for_illegal_params4(@params) == 0;
2594							bless {
2595							_output_training_datafile => $args{'output_training_datafile'}
2596							\|\| die("name for output_training_datafile required"),
2597							_parameter_file => $args{'parameter_file'}
2598							\|\| die("parameter_file required"),
2599							_number_of_samples_for_training => $args{'number_of_samples_for_training'}
2600							\|\| die("number_of_samples_for_training required"),
2601	1		50			31	_debug => $args{debug} \|\| 0,
			50
			50
			50
2602							_class_names => [],
2603							_class_priors => [],
2604							_features_and_values_hash => {},
2605							_bias_hash => {},
2606							_training_sample_records => {},
2607							}, $class;
2608							}
2609
2610							sub check_for_illegal_params4 {
2611	1			1		5	my @params = @_;
2612	1					5	my @legal_params = qw / output_training_datafile
2613							parameter_file
2614							number_of_samples_for_training
2615							debug
2616							/;
2617	1					4	my $found_match_flag;
2618	1					4	foreach my $param (@params) {
2619	3					10	foreach my $legal (@legal_params) {
2620	6					15	$found_match_flag = 0;
2621	6	100				23	if ($param eq $legal) {
2622	3					9	$found_match_flag = 1;
2623	3					8	last;
2624							}
2625							}
2626	3	50				14	last if $found_match_flag == 0;
2627							}
2628	1					8	return $found_match_flag;
2629							}
2630
2631							## Read a parameter file for generating symbolic training data. See the script
2632							## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2633							## pass the name of the parameter file to the constructor of the
2634							## TrainingDataGeneratorSymbolic class.
2635							sub read_parameter_file_symbolic {
2636	1			1		9	my $self = shift;
2637	1					10	my $debug = $self->{_debug};
2638	1					5	my $number_of_training_samples = $self->{_number_of_samples_for_training};
2639	1					3	my $input_parameter_file = $self->{_parameter_file};
2640	1	50				7	croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2641	1					3	my $output_file_training = $self->{_output_training_datafile};
2642	1					4	my $output_file_testing = $self->{_output_test_datafile};
2643	1					4	my @all_params;
2644							my $param_string;
2645	1		33			36	open INPUT, $input_parameter_file \|\| "unable to open parameter file: $!";
2646	1					53	@all_params = ;
2647	1					9	@all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
	40					150
2648	1					5	@all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
	36					173
2649	1					9	$param_string = join ' ', @all_params;
2650	1					16	my ($class_names, $class_priors, $rest_param) =
2651							$param_string =~ /^\sclass names:(.?)\sclass priors:(.?)(feature: .*)/;
2652	1	50				12	my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
	3					26
2653	1					25	push @{$self->{_class_names}}, @class_names;
	1					7
2654	1	50				16	my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
	3					24
2655	1					4	push @{$self->{_class_priors}}, @class_priors;
	1					6
2656	1					15	my ($feature_string, $bias_string) = $rest_param =~ /(feature:.?) (bias:.)/;
2657	1					4	my %features_and_values_hash;
2658	1					24	my @features = split /(feature[:])/, $feature_string;
2659	1	50				5	@features = grep {defined($_) && length($_) > 0} @features;
	9					52
2660	1					4	foreach my $item (@features) {
2661	8	100				38	next if $item =~ /feature/;
2662	4					24	my @splits = split / /, $item;
2663	4	50				14	@splits = grep {defined($_) && length($_) > 0} @splits;
	27					148
2664	4					16	foreach my $i (0..@splits-1) {
2665	22	100				67	if ($i == 0) {
2666	4					20	$features_and_values_hash{$splits[0]} = [];
2667							} else {
2668	18	100				70	next if $splits[$i] =~ /values/;
2669	14					29	push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
	14					56
2670							}
2671							}
2672							}
2673	1					6	$self->{_features_and_values_hash} = \%features_and_values_hash;
2674	1					4	my %bias_hash = %{$self->{_bias_hash}};
	1					6
2675	1					20	my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2676	1	50				5	@biases = grep {defined($_) && length($_) > 0} @biases;
	5					35
2677	1					5	foreach my $item (@biases) {
2678	4	100				21	next if $item =~ /bias/;
2679	2					24	my @splits = split /\s+/, $item;
2680	2	50				8	@splits = grep {defined($_) && length($_) > 0} @splits;
	18					97
2681	2					6	my $feature_name;
2682	2					16	foreach my $i (0..@splits-1) {
2683	16	100				89	if ($i == 0) {
		100
2684	2					10	$bias_hash{$splits[0]} = {};
2685							} elsif ($splits[$i] =~ /(^.+)[:]$/) {
2686	8					29	$feature_name = $1;
2687	8					37	$bias_hash{$splits[0]}->{$feature_name} = [];
2688							} else {
2689	6	50				28	next if !defined $feature_name;
2690	6	50				20	push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
	6					26
2691							if defined $feature_name;
2692							}
2693							}
2694							}
2695	1					5	$self->{_bias_hash} = \%bias_hash;
2696	1	50				16	if ($debug) {
2697	0					0	print "\n\nClass names: @class_names\n";
2698	0					0	my $num_of_classes = @class_names;
2699	0					0	print "Class priors: @class_priors\n";
2700	0					0	print "Number of classes: $num_of_classes\n";
2701	0					0	print "\nHere are the features and their possible values:\n";
2702	0					0	while ( my ($k, $v) = each %features_and_values_hash ) {
2703	0					0	print "$k ===> @$v\n";
2704							}
2705	0					0	print "\nHere is the biasing for each class:\n";
2706	0					0	while ( my ($k, $v) = each %bias_hash ) {
2707	0					0	print "$k:\n";
2708	0					0	while ( my ($k1, $v1) = each %$v ) {
2709	0					0	print " $k1 ===> @$v1\n";
2710							}
2711							}
2712							}
2713							}
2714
2715							## This method generates training data according to the specifications placed in a
2716							## parameter file that is read by the previous method.
2717							sub gen_symbolic_training_data {
2718	1			1		11	my $self = shift;
2719	1					3	my @class_names = @{$self->{_class_names}};
	1					7
2720	1					4	my @class_priors = @{$self->{_class_priors}};
	1					5
2721	1					3	my %training_sample_records;
2722	1					3	my %features_and_values_hash = %{$self->{_features_and_values_hash}};
	1					8
2723	1					4	my %bias_hash = %{$self->{_bias_hash}};
	1					5
2724	1					5	my $how_many_training_samples = $self->{_number_of_samples_for_training};
2725	1					4	my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2726	1					3	my %class_priors_to_unit_interval_map;
2727	1					3	my $accumulated_interval = 0;
2728	1					6	foreach my $i (0..@class_names-1) {
2729	2					13	$class_priors_to_unit_interval_map{$class_names[$i]}
2730							= [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2731	2					8	$accumulated_interval += $class_priors[$i];
2732							}
2733	1	50				6	if ($self->{_debug}) {
2734	0					0	print "Mapping of class priors to unit interval: \n";
2735	0					0	while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2736	0					0	print "$k => @$v\n";
2737							}
2738	0					0	print "\n\n";
2739							}
2740	1					10	my $ele_index = 0;
2741	1					6	while ($ele_index < $how_many_training_samples) {
2742	35					86	my $sample_name = "sample" . "_$ele_index";
2743	35					108	$training_sample_records{$sample_name} = [];
2744							# Generate class label for this training sample:
2745	35					109	my $roll_the_dice = rand(1.0);
2746	35					56	my $class_label;
2747	35					95	foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2748	52					93	my $v = $class_priors_to_unit_interval_map{$class_name};
2749	52	100	66			245	if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2750	35					67	push @{$training_sample_records{$sample_name}},
	35					103
2751							"class=" . $class_name;
2752	35					69	$class_label = $class_name;
2753	35					66	last;
2754							}
2755							}
2756	35					92	foreach my $feature (keys %features_and_values_hash) {
2757	140					259	my @values = @{$features_and_values_hash{$feature}};
	140					411
2758	140					303	my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2759	140					289	my $no_bias = 1.0 / @values;
2760	140	100				482	$bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2761	140					232	my %value_priors_to_unit_interval_map;
2762	140					700	my @splits = split /\s=\s/, $bias_string;
2763	140					342	my $chosen_for_bias_value = $splits[0];
2764	140					230	my $chosen_bias = $splits[1];
2765	140					365	my $remaining_bias = 1 - $chosen_bias;
2766	140					298	my $remaining_portion_bias = $remaining_bias / (@values -1);
2767	140	50				301	@splits = grep {defined($_) && length($_) > 0} @splits;
	280					1359
2768	140					291	my $accumulated = 0;
2769	140					335	foreach my $i (0..@values-1) {
2770	490	100				1220	if ($values[$i] eq $chosen_for_bias_value) {
2771	140					407	$value_priors_to_unit_interval_map{$values[$i]}
2772							= [$accumulated, $accumulated + $chosen_bias];
2773	140					304	$accumulated += $chosen_bias;
2774							} else {
2775	350					866	$value_priors_to_unit_interval_map{$values[$i]}
2776							= [$accumulated, $accumulated + $remaining_portion_bias];
2777	350					677	$accumulated += $remaining_portion_bias;
2778							}
2779							}
2780	140					276	my $roll_the_dice = rand(1.0);
2781	140					230	my $value_label;
2782	140					353	foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2783	307					546	my $v = $value_priors_to_unit_interval_map{$value_name};
2784	307	100	100			1282	if ( ($roll_the_dice >= $v->[0])
2785							&& ($roll_the_dice <= $v->[1]) ) {
2786	140					232	push @{$training_sample_records{$sample_name}},
	140					490
2787							$feature . "=" . $value_name;
2788	140					260	$value_label = $value_name;
2789	140					352	last;
2790							}
2791							}
2792	140	50				676	if ($self->{_debug}) {
2793	0					0	print "mapping feature value priors for '$feature' " .
2794							"to unit interval: \n";
2795	0					0	while ( my ($k, $v) =
2796							each %value_priors_to_unit_interval_map ) {
2797	0					0	print "$k => @$v\n";
2798							}
2799	0					0	print "\n\n";
2800							}
2801							}
2802	35					120	$ele_index++;
2803							}
2804	1					4	$self->{_training_sample_records} = \%training_sample_records;
2805	1	50				6	if ($self->{_debug}) {
2806	0					0	print "\n\nPRINTING TRAINING RECORDS:\n\n";
2807	0					0	foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
	0					0
2808	0					0	print "$kee => @{$training_sample_records{$kee}}\n\n";
	0					0
2809							}
2810							}
2811	1					4	my $output_training_file = $self->{_output_training_datafile};
2812	1	50				5	print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2813	1					120	open FILEHANDLE, ">$output_training_file";
2814	1					10	my @features = sort keys %features_and_values_hash;
2815	1					4	my $title_string = ',class';
2816	1					4	foreach my $feature_name (@features) {
2817	4					9	$title_string .= ',' . $feature_name;
2818							}
2819	1					12	print FILEHANDLE "$title_string\n";
2820	1					15	my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
	105					156
	35					76
	35					63
2821	1					6	my $record_string = '';
2822	1					3	foreach my $sample_name (@sample_names) {
2823	35					61	$record_string .= "$sample_name,";
2824	35					44	my @record = @{$training_sample_records{"sample_$sample_name"}};
	35					111
2825	35					59	my %item_parts_hash;
2826	35					50	foreach my $item (@record) {
2827	175					431	my @splits = grep $_, split /=/, $item;
2828	175					365	$item_parts_hash{$splits[0]} = $splits[1];
2829							}
2830	35					76	$record_string .= $item_parts_hash{"class"};
2831	35					64	delete $item_parts_hash{"class"};
2832	35					102	my @kees = sort keys %item_parts_hash;
2833	35					67	foreach my $kee (@kees) {
2834	140					232	$record_string .= ",$item_parts_hash{$kee}";
2835							}
2836	35					67	print FILEHANDLE "$record_string\n";
2837	35					85	$record_string = '';
2838							}
2839	1					62	close FILEHANDLE;
2840							}
2841
2842							sub sample_index {
2843	0			0			my $arg = shift;
2844	0						$arg =~ /_(.+)$/;
2845	0						return $1;
2846							}
2847
2848							################################# Decision Tree Introspection #######################################
2849							################################# Class DTIntrospection #######################################
2850
2851							package DTIntrospection;
2852
2853							## Instances constructed from this class can provide explanations for the
2854							## classification decisions at the nodes of a decision tree.
2855							##
2856							## When used in the interactive mode, the decision-tree introspection made possible
2857							## by this class provides answers to the following three questions: (1) List of the
2858							## training samples that fall in the portion of the feature space that corresponds
2859							## to a node of the decision tree; (2) The probabilities associated with the last
2860							## feature test that led to the node; and (3) The class probabilities predicated on
2861							## just the last feature test on the path to that node.
2862							##
2863							## CAVEAT: It is possible for a node to exist even when there are no training
2864							## samples in the portion of the feature space that corresponds to the node. That
2865							## is because a decision tree is based on the probability densities estimated from
2866							## the training data. When training data is non-uniformly distributed, it is
2867							## possible for the probability associated with a point in the feature space to be
2868							## non-zero even when there are no training samples at or in the vicinity of that
2869							## point.
2870							##
2871							## For a node to exist even where there are no training samples in the portion of
2872							## the feature space that belongs to the node is an indication of the generalization
2873							## ability of decision-tree based classification.
2874							##
2875							## When used in a non-interactive mode, an instance of this class can be used to
2876							## create a tabular display that shows what training samples belong directly to the
2877							## portion of the feature space that corresponds to each node of the decision tree.
2878							## An instance of this class can also construct a tabular display that shows how the
2879							## influence of each training sample propagates in the decision tree. For each
2880							## training sample, this display first shows the list of nodes that came into
2881							## existence through feature test(s) that used the data provided by that sample.
2882							## This list for each training sample is followed by a subtree of the nodes that owe
2883							## their existence indirectly to the training sample. A training sample influences a
2884							## node indirectly if the node is a descendant of another node that is affected
2885							## directly by the training sample.
2886
2887	1			1		18	use strict;
	1					5
	1					47
2888	1			1		9	use Carp;
	1					3
	1					3391
2889
2890							sub new {
2891	0			0			my ($class, $dt) = @_;
2892	0	0					croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2893							unless ref($dt) eq "Algorithm::DecisionTree";
2894							bless {
2895							_dt => $dt,
2896							_root_dtnode => $dt->{_root_node},
2897	0						_samples_at_nodes_hash => {},
2898							_branch_features_to_nodes_hash => {},
2899							_sample_to_node_mapping_direct_hash => {},
2900							_node_serial_num_to_node_hash => {},
2901							_awareness_raising_msg_shown => 0,
2902							_debug => 0,
2903							}, $class;
2904							}
2905
2906							sub initialize {
2907	0			0			my $self = shift;
2908							croak "You must first construct the decision tree before using the DTIntrospection class"
2909	0	0					unless $self->{_root_dtnode};
2910	0						$self->recursive_descent($self->{_root_dtnode});
2911							}
2912
2913							sub recursive_descent {
2914	0			0			my $self = shift;
2915	0						my $node = shift;
2916	0						my $node_serial_number = $node->get_serial_num();
2917	0						my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2918	0	0					print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
	0
2919	0						$self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2920	0						$self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2921	0						my @samples_at_node = ();
2922	0						foreach my $item (@$branch_features_and_values_or_thresholds) {
2923	0						my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2924	0	0					unless (@samples_at_node) {
2925	0						@samples_at_node = @$samples_for_feature_value_combo;
2926							} else {
2927	0						my @accum;
2928	0						foreach my $sample (@samples_at_node) {
2929	0	0					push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2930							}
2931	0						@samples_at_node = @accum;
2932							}
2933	0	0					last unless @samples_at_node;
2934							}
2935	0						@samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
	0
2936	0	0					print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2937	0						$self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2938	0	0					if (@samples_at_node) {
2939	0						foreach my $sample (@samples_at_node) {
2940	0	0					if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2941	0						$self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2942							} else {
2943	0						push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
	0
2944							}
2945							}
2946							}
2947	0						my $children = $node->get_children();
2948	0						foreach my $child (@$children) {
2949	0						$self->recursive_descent($child);
2950							}
2951							}
2952
2953							sub display_training_samples_at_all_nodes_direct_influence_only {
2954	0			0			my $self = shift;
2955							croak "You must first construct the decision tree before using the DT Introspection class."
2956	0	0					unless $self->{_root_dtnode};
2957	0						$self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2958							}
2959
2960							sub recursive_descent_for_showing_samples_at_a_node{
2961	0			0			my $self = shift;
2962	0						my $node = shift;
2963	0						my $node_serial_number = $node->get_serial_num();
2964	0						my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2965	0	0					if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2966	0	0					print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
	0
2967	0						print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
	0
2968							}
2969	0						map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
	0
2970							}
2971
2972							sub display_training_samples_to_nodes_influence_propagation {
2973	0			0			my $self = shift;
2974	0						foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
	0
	0
2975	0	0					if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2976	0						my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2977	0						print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
	0
2978	0						print " nodes affected through probabilistic generalization:\n";
2979	0						map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2980							}
2981							}
2982							}
2983
2984							sub recursive_descent_for_sample_to_node_influence {
2985	0			0			my $self = shift;
2986	0						my $node_serial_num = shift;
2987	0						my $nodes_already_accounted_for = shift;
2988	0						my $offset = shift;
2989	0						$offset .= " ";
2990	0						my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2991	0						my @children = map $_->get_serial_num(), @{$node->get_children()};
	0
2992	0						my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
	0
	0
2993	0	0					if (@children_affected) {
2994	0						print "$offset $node_serial_num => [@children_affected]\n";
2995							}
2996	0						map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
2997							}
2998
2999							sub get_samples_for_feature_value_combo {
3000	0			0			my $self = shift;
3001	0						my $feature_value_combo = shift;
3002	0						my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3003	0						my @samples = ();
3004	0	0					if ($op eq '=') {
		0
		0
3005	0						@samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
	0
	0
3006							} elsif ($op eq '<') {
3007	0						foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
	0
3008	0						my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
	0
3009	0						foreach my $item (@features_and_values) {
3010	0						my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3011	0	0	0				if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
			0
3012	0						push @samples, $sample;
3013	0						last;
3014							}
3015							}
3016							}
3017							} elsif ($op eq '>') {
3018	0						foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
	0
3019	0						my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
	0
3020	0						foreach my $item (@features_and_values) {
3021	0						my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3022	0	0	0				if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
			0
3023	0						push @samples, $sample;
3024	0						last;
3025							}
3026							}
3027							}
3028							} else {
3029	0						die "Something strange is going on";
3030							}
3031	0						return \@samples;
3032							}
3033
3034							sub extract_feature_op_val {
3035	0			0			my $self = shift;
3036	0						my $feature_value_combo = shift;
3037	0						my $pattern1 = '(.+)=(.+)';
3038	0						my $pattern2 = '(.+)<(.+)';
3039	0						my $pattern3 = '(.+)>(.+)';
3040	0						my ($feature,$value,$op);
3041	0	0					if ($feature_value_combo =~ /$pattern2/) {
		0
		0
3042	0						($feature,$op,$value) = ($1,'<',$2);
3043							} elsif ($feature_value_combo =~ /$pattern3/) {
3044	0						($feature,$op,$value) = ($1,'>',$2);
3045							} elsif ($feature_value_combo =~ /$pattern1/) {
3046	0						($feature,$op,$value) = ($1,'=',$2);
3047							}
3048	0						return ($feature,$op,$value);
3049							}
3050
3051							sub explain_classifications_at_multiple_nodes_interactively {
3052	0			0			my $self = shift;
3053							croak "You called explain_classification_at_multiple_nodes_interactively() without " .
3054							"first initializing the DTIntrospection instance in your code. Aborting."
3055	0	0					unless $self->{_samples_at_nodes_hash};
3056	0						print "\n\nIn order for the decision tree to introspect\n\n";
3057	0						print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3058							" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3059							" with it even when there are NO training data points in that region?\n" .
3060							"\nEnter 'y' for yes or any other character for no: ";
3061	0						my $ans = ;
3062	0						$ans =~ s/^\s\|\s$//g;
3063	0	0					die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3064	0						$self->{_awareness_raising_msg_shown} = 1;
3065	0						while (1) {
3066	0						my $node_id;
3067							my $ans;
3068	0						while (1) {
3069	0						print "\nEnter the integer ID of a node: ";
3070	0						$ans = ;
3071	0						$ans =~ s/^\s\|\s$//g;
3072	0	0					return if $ans =~ /^exit$/;
3073	0	0					last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
	0
3074	0						print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3075							}
3076	0						$node_id = $ans;
3077	0						$self->explain_classification_at_one_node($node_id)
3078							}
3079							}
3080
3081							sub explain_classification_at_one_node {
3082	0			0			my $self = shift;
3083	0						my $node_id = shift;
3084							croak "You called explain_classification_at_one_node() without first initializing " .
3085	0	0					"the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3086	0	0					unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3087	0						print "Node $node_id is not a node in the tree\n";
3088	0						return;
3089							}
3090	0	0					unless ($self->{_awareness_raising_msg_shown}) {
3091	0						print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3092	0						print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3093							" that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3094							" with it even when there are NO training data points in that region?\n" .
3095							"\nEnter 'y' for yes or any other character for no: ";
3096	0						my $ans = ;
3097	0						$ans =~ s/^\s\|\s$//g;
3098	0	0					die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3099							}
3100	0						my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
	0
3101	0						my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
	0
3102							# my @class_names = @{$self->get_class_names()};
3103	0						my @class_names = $self->{_dt}->get_class_names();
3104	0						my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3105	0						my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3106	0	0					my $msg = @samples_at_node == 0
3107							? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3108							: "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3109	0						$msg .= "\n Features tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3110	0						$msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3111	0						$msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3112	0						print $msg;
3113	0						my $ans = ;
3114	0						$ans =~ s/^\s\|\s$//g;
3115	0	0					if ($ans =~ /^ye?s?$/) {
3116	0						my $sequence = [$branch_features_to_node[-1]];
3117	0						my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3118	0						print "\n probability of @{$sequence} is: $prob\n";
	0
3119							}
3120	0						$msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3121	0						$msg .= "\n Enter 'y' for yes and any other character for no: ";
3122	0						print $msg;
3123	0						$ans = ;
3124	0						$ans =~ s/^\s\|\s$//g;
3125	0	0					if ($ans =~ /^ye?s?$/) {
3126	0						my $sequence = [$branch_features_to_node[-1]];
3127	0						foreach my $cls (@class_names) {
3128	0						my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3129	0						print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
	0
3130							}
3131							} else {
3132	0						print "goodbye\n";
3133							}
3134	0						print "\n Finished supplying information on Node $node_id\n\n";
3135							}
3136
3137							1;
3138
3139							=pod
3140
3141							=encoding UTF8
3142
3143							=head1 NAME
3144
3145							Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3146							multidimensional data.
3147
3148
3149							=head1 SYNOPSIS
3150
3151							# FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3152
3153							# In general, your call for constructing an instance of the DecisionTree class
3154							# will look like:
3155
3156							my $training_datafile = "stage3cancer.csv";
3157							my $dt = Algorithm::DecisionTree->new(
3158							training_datafile => $training_datafile,
3159							csv_class_column_index => 2,
3160							csv_columns_for_features => [3,4,5,6,7,8],
3161							entropy_threshold => 0.01,
3162							max_depth_desired => 8,
3163							symbolic_to_numeric_cardinality_threshold => 10,
3164							csv_cleanup_needed => 1,
3165							);
3166
3167							# The constructor option `csv_class_column_index' informs the module as to which
3168							# column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3169							# BASED. The constructor option `csv_columns_for_features' specifies which columns
3170							# are to be used for feature values. The first row of the CSV file must specify
3171							# the names of the features. See examples of CSV files in the `Examples'
3172							# subdirectory.
3173
3174							# The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3175							# the example shown above, if an ostensibly numeric feature takes on only 10 or
3176							# fewer different values in your training datafile, it will be treated like a
3177							# symbolic features. The option `entropy_threshold' determines the granularity
3178							# with which the entropies are sampled for the purpose of calculating entropy gain
3179							# with a particular choice of decision threshold for a numeric feature or a feature
3180							# value for a symbolic feature.
3181
3182							# The option 'csv_cleanup_needed' is by default set to 0. If you set it
3183							# to 1, that would cause all line records in your CSV file to be "sanitized" before
3184							# they are used for constructing a decision tree. You need this option if your CSV
3185							# file uses double-quoted field names and field values in the line records and if
3186							# such double-quoted strings are allowed to include commas for, presumably, better
3187							# readability.
3188
3189							# After you have constructed an instance of the DecisionTree class as shown above,
3190							# you read in the training data file and initialize the probability cache by
3191							# calling:
3192
3193							$dt->get_training_data();
3194							$dt->calculate_first_order_probabilities();
3195							$dt->calculate_class_priors();
3196
3197							# Next you construct a decision tree for your training data by calling:
3198
3199							$root_node = $dt->construct_decision_tree_classifier();
3200
3201							# where $root_node is an instance of the DTNode class that is also defined in the
3202							# module file. Now you are ready to classify a new data record. Let's say that
3203							# your data record looks like:
3204
3205							my @test_sample = qw / g2=4.2
3206							grade=2.3
3207							gleason=4
3208							eet=1.7
3209							age=55.0
3210							ploidy=diploid /;
3211
3212							# You can classify it by calling:
3213
3214							my $classification = $dt->classify($root_node, \@test_sample);
3215
3216							# The call to `classify()' returns a reference to a hash whose keys are the class
3217							# names and the values the associated classification probabilities. This hash also
3218							# includes another key-value pair for the solution path from the root node to the
3219							# leaf node at which the final classification was carried out.
3220
3221
3222							=head1 CHANGES
3223
3224							B This version fixes a bug in the C function.
3225							The source of the bug was a typo in a regex component meant for matching with white
3226							space. I have also made one additional change to this function to increase its
3227							versatility. With this change, you are now allowed to have empty strings as values
3228							for features.
3229
3230							B This version reintroduces C as an optional
3231							parameter in the module constructor. This was done in response to several requests
3232							received from the user community. (Previously, all line records from a CSV file were
3233							processed by the C function no matter what.) The main point made by
3234							the users was that invoking C when there was no need for CSV clean-up
3235							extracted a performance penalty when ingesting large database files with tens of
3236							thousands of line records. In addition to making C optional, I
3237							have also tweaked up the code in the C function in order to extract
3238							data from a larger range of messy CSV files.
3239
3240							B All the changes made in this version relate to the construction of
3241							regression trees. I have fixed a couple of bugs in the calculation of the regression
3242							coefficients. Additionally, the C class now comes with a new
3243							constructor parameter named C. For most cases, you'd set this
3244							parameter to 0, which causes the regression coefficients to be estimated through
3245							linear least-squares minimization.
3246
3247							B In addition to constructing decision trees, this version of the
3248							module also allows you to construct regression trees. The regression tree capability
3249							has been packed into a separate subclass, named C, of the main
3250							C class. The subdirectory C in the main
3251							installation directory illustrates how you can use this new functionality of the
3252							module.
3253
3254							B This version incorporates four very significant upgrades/changes to
3255							the C module: B<(1)> The CSV cleanup is now the default. So you do not
3256							have to set any special parameters in the constructor calls to initiate CSV
3257							cleanup. B<(2)> In the form of a new Perl class named C,
3258							this module provides you with an easy-to-use programming interface for attempting
3259							needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3260							dominated by a single class. You need to set the constructor parameter
3261							C to invoke the logic that constructs multiple
3262							decision trees, each using the minority class samples along with samples drawn
3263							randomly from the majority class. The final classification is made through a
3264							majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3265							big-data problem --- in the sense that you have been given a training database with a
3266							very large number of training records --- the class C will
3267							also let you construct multiple decision trees by pulling training data randomly from
3268							your training database (without paying attention to the relative populations of the
3269							classes). The final classification decision for a test sample is based on a majority
3270							vote from all the decision trees thus constructed. See the C
3271							directory for how to use these new features of the module. And, finally, B<(4)>
3272							Support for the old-style '.dat' training files has been dropped in this version.
3273
3274							B This version makes it easier to use a CSV training file that
3275							violates the assumption that a comma be used only to separate the different field
3276							values in a line record. Some large econometrics databases use double-quoted values
3277							for fields, and these values may contain commas (presumably for better readability).
3278							This version also allows you to specify the leftmost entry in the first CSV record
3279							that names all the fields. Previously, this entry was required to be an empty
3280							double-quoted string. I have also made some minor changes to the
3281							'C' method to make it more user friendly for large
3282							training files that may contain tens of thousands of records. When pulling training
3283							data from such files, this method prints out a dot on the terminal screen for every
3284							10000 records it has processed.
3285
3286							B This version brings the boosting capability to the C
3287							module.
3288
3289							B This version adds bagging to the C module. If your
3290							training dataset is large enough, you can ask the module to construct multiple
3291							decision trees using data bags extracted from your dataset. The module can show you
3292							the results returned by the individual decision trees and also the results obtained
3293							by taking a majority vote of the classification decisions made by the individual
3294							trees. You can specify any arbitrary extent of overlap between the data bags.
3295
3296							B The introspection capability in this version packs more of a punch.
3297							For each training data sample, you can now figure out not only the decision-tree
3298							nodes that are affected directly by that sample, but also those nodes that are
3299							affected indirectly through the generalization achieved by the probabilistic modeling
3300							of the data. The 'examples' directory of this version includes additional scripts
3301							that illustrate these enhancements to the introspection capability. See the section
3302							"The Introspection API" for a declaration of the introspection related methods, old
3303							and new.
3304
3305							B In response to requests from several users, this version includes a new
3306							capability: You can now ask the module to introspect about the classification
3307							decisions returned by the decision tree. Toward that end, the module includes a new
3308							class named C. Perhaps the most important bit of information you
3309							are likely to seek through DT introspection is the list of the training samples that
3310							fall directly in the portion of the feature space that is assigned to a node.
3311							B When training samples are non-uniformly distributed in the underlying
3312							feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3313							IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3314							part of the generalization achieved by probabilistic modeling of the training data.)>
3315							For additional information related to DT introspection, see the section titled
3316							"DECISION TREE INTROSPECTION" in this documentation page.
3317
3318							B fixes a bug in the part of the module that some folks use for generating
3319							synthetic data for experimenting with decision tree construction and classification.
3320							In the class C that is a part of the module, there
3321							was a problem with the order in which the features were recorded from the
3322							user-supplied parameter file. The basic code for decision tree construction and
3323							classification remains unchanged.
3324
3325							B further downshifts the required version of Perl for this module. This
3326							was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3327							in the module code needed to be changed for the module to work with the older version
3328							of Perl.
3329
3330							B fixes the C restriction on the required Perl version. This
3331							version should work with Perl versions 5.14.0 and higher.
3332
3333							B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3334							else remains the same.
3335
3336							B should prove more robust when the probability distribution for the
3337							values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3338							observations can occur with significant probabilities. A new option in the
3339							DecisionTree constructor lets the user specify the precision with which the
3340							probability distributions are estimated for such features.
3341
3342							B fixes a bug that was caused by the explicitly set zero values for
3343							numerical features being misconstrued as "false" in the conditional statements in
3344							some of the method definitions.
3345
3346							B makes it easier to write code for classifying in one go all of your test
3347							data samples in a CSV file. The bulk classifications obtained can be written out to
3348							either a CSV file or to a regular text file. See the script
3349							C in the C directory for how to
3350							classify all of your test data records in a CSV file. This version also includes
3351							improved code for generating synthetic numeric/symbolic training and test data
3352							records for experimenting with the decision tree classifier.
3353
3354							B allows you to test the quality of your training data by running a 10-fold
3355							cross-validation test on the data. This test divides all of the training data into
3356							ten parts, with nine parts used for training a decision tree and one part used for
3357							testing its ability to classify correctly. This selection of nine parts for training
3358							and one part for testing is carried out in all of the ten different ways that are
3359							possible. This testing functionality in Version 2.1 can also be used to find the
3360							best values to use for the constructor parameters C,
3361							C, and C.
3362
3363							B Now you can use both numeric and
3364							symbolic features for constructing a decision tree. A feature is numeric if it can
3365							take any floating-point value over an interval.
3366
3367							B fixes a bug in the code that was triggered by 0 being declared as one of
3368							the features values in the training datafile. Version 1.71 also include an additional
3369							safety feature that is useful for training datafiles that contain a very large number
3370							of features. The new version makes sure that the number of values you declare for
3371							each sample record matches the number of features declared at the beginning of the
3372							training datafile.
3373
3374							B includes safety checks on the consistency of the data you place in your
3375							training datafile. When a training file contains thousands of samples, it is
3376							difficult to manually check that you used the same class names in your sample records
3377							that you declared at top of your training file or that the values you have for your
3378							features are legal vis-a-vis the earlier declarations of the values in the training
3379							file. Another safety feature incorporated in this version is the non-consideration
3380							of classes that are declared at the top of the training file but that have no sample
3381							records in the file.
3382
3383							B uses probability caching much more extensively compared to the previous
3384							versions. This should result in faster construction of large decision trees.
3385							Another new feature in Version 1.6 is the use of a decision tree for interactive
3386							classification. In this mode, after you have constructed a decision tree from the
3387							training data, the user is prompted for answers to the questions pertaining to the
3388							feature tests at the nodes of the tree.
3389
3390							Some key elements of the documentation were cleaned up and made more readable in
3391							B. The implementation code remains unchanged from Version 1.4.
3392
3393							B should make things faster (and easier) for folks who want to use this
3394							module with training data that creates very large decision trees (that is, trees with
3395							tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3396							achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3397							addition, this version provides an additional constructor parameter,
3398							C for controlling the size of the decision tree. This is in
3399							addition to the tree size control achieved by the parameter C that
3400							was introduced in Version 1.3. Since large decision trees can take a long time to
3401							create, you may find yourself wishing you could store the tree you just created in a
3402							disk file and that, subsequently, you could use the stored tree for classification
3403							work. The C directory contains two scripts, C and
3404							C, that show how you can do exactly that with the
3405							help of Perl's C module.
3406
3407							B addresses the issue that arises when the header of a training datafile
3408							declares a certain possible value for a feature but that (feature,value) pair does
3409							NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3410							user to control the size of the decision tree by changing the value of the parameter
3411							C Additionally, Version 1.3 includes a method called
3412							C that displays useful information regarding the size and
3413							some other attributes of the training data. It also warns the user if the training
3414							data might result in a decision tree that would simply be much too large --- unless
3415							the user controls the size with the entropy_threshold parameter.
3416
3417							In addition to the removal of a couple of serious bugs, B incorporates a
3418							number of enhancements: (1) Version 1.2 includes checks on the names of the features
3419							and values used in test data --- this is the data you want to classify with the
3420							decision tree classifier constructed by this module. (2) Version 1.2 includes a
3421							separate constructor for generating test data. To make it easier to generate test
3422							data whose probabilistic parameters may not be identical to that used for the
3423							training data, I have used separate routines for generating the test data. (3)
3424							Version 1.2 also includes in its examples directory a script that classifies the test
3425							data in a file and outputs the class labels into another file. This is for folks who
3426							do not wish to write their own scripts using this module. (4) Version 1.2 also
3427							includes addition to the documentation regarding the issue of numeric values for
3428							features.
3429
3430							=head1 DESCRIPTION
3431
3432							B is a I module for constructing a decision tree from
3433							a training datafile containing multidimensional data. In one form or another,
3434							decision trees have been around for about fifty years. From a statistical
3435							perspective, they are closely related to classification and regression by recursive
3436							partitioning of multidimensional data. Early work that demonstrated the usefulness
3437							of such partitioning of data for classification and regression can be traced to the
3438							work of Terry Therneau in the early 1980's in the statistics community, and to the
3439							work of Ross Quinlan in the mid 1990's in the machine learning community.
3440
3441							For those not familiar with decision tree ideas, the traditional way to classify
3442							multidimensional data is to start with a feature space whose dimensionality is the
3443							same as that of the data. Each feature in this space corresponds to the attribute
3444							that each dimension of the data measures. You then use the training data to carve up
3445							the feature space into different regions, each corresponding to a different class.
3446							Subsequently, when you try to classify a new data sample, you locate it in the
3447							feature space and find the class label of the region to which it belongs. One can
3448							also give the new data point the same class label as that of the nearest training
3449							sample. This is referred to as the nearest neighbor classification. There exist
3450							hundreds of variations of varying power on these two basic approaches to the
3451							classification of multidimensional data.
3452
3453							A decision tree classifier works differently. When you construct a decision tree,
3454							you select for the root node a feature test that partitions the training data in a
3455							way that causes maximal disambiguation of the class labels associated with the data.
3456							In terms of information content as measured by entropy, such a feature test would
3457							cause maximum reduction in class entropy in going from all of the training data taken
3458							together to the data as partitioned by the feature test. You then drop from the root
3459							node a set of child nodes, one for each partition of the training data created by the
3460							feature test at the root node. When your features are purely symbolic, you'll have
3461							one child node for each value of the feature chosen for the feature test at the root.
3462							When the test at the root involves a numeric feature, you find the decision threshold
3463							for the feature that best bipartitions the data and you drop from the root node two
3464							child nodes, one for each partition. Now at each child node you pose the same
3465							question that you posed when you found the best feature to use at the root: Which
3466							feature at the child node in question would maximally disambiguate the class labels
3467							associated with the training data corresponding to that child node?
3468
3469							As the reader would expect, the two key steps in any approach to decision-tree based
3470							classification are the construction of the decision tree itself from a file
3471							containing the training data, and then using the decision tree thus obtained for
3472							classifying new data.
3473
3474							What is cool about decision tree classification is that it gives you soft
3475							classification, meaning it may associate more than one class label with a given data
3476							vector. When this happens, it may mean that your classes are indeed overlapping in
3477							the underlying feature space. It could also mean that you simply have not supplied
3478							sufficient training data to the decision tree classifier. For a tutorial
3479							introduction to how a decision tree is constructed and used, visit
3480							L
3481
3482							This module also allows you to generate your own synthetic training and test
3483							data. Generating your own training data, using it for constructing a decision-tree
3484							classifier, and subsequently testing the classifier on a synthetically generated
3485							test set of data is a good way to develop greater proficiency with decision trees.
3486
3487
3488							=head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3489
3490							If you are new to the concept of a decision tree, their practical utility is best
3491							understood with an example that only involves symbolic features. However, as
3492							mentioned earlier, versions of the module higher than 2.0 allow you to use both
3493							symbolic and numeric features.
3494
3495							Consider the following scenario: Let's say you are running a small investment company
3496							that employs a team of stockbrokers who make buy/sell decisions for the customers of
3497							your company. Assume that your company has asked the traders to make each investment
3498							decision on the basis of the following four criteria:
3499
3500							price_to_earnings_ratio (P_to_E)
3501
3502							price_to_sales_ratio (P_to_S)
3503
3504							return_on_equity (R_on_E)
3505
3506							market_share (MS)
3507
3508							Since you are the boss, you keep track of the buy/sell decisions made by the
3509							individual traders. But one unfortunate day, all of your traders decide to quit
3510							because you did not pay them enough. So what do you do? If you had a module like
3511							the one here, you could still run your company and do so in such a way that, on the
3512							average, would do better than any of the individual traders who worked for your
3513							company. This is what you do: You pool together the individual trader buy/sell
3514							decisions you have accumulated during the last one year. This pooled information is
3515							likely to look like:
3516
3517
3518							example buy/sell P_to_E P_to_S R_on_E MS
3519							============================================================+=
3520
3521							example_1 buy high low medium low
3522							example_2 buy medium medium low low
3523							example_3 sell low medium low high
3524							....
3525							....
3526
3527							This data, when formatted according to CSV, would constitute your training file. You
3528							could feed this file into the module by calling:
3529
3530							my $dt = Algorithm::DecisionTree->new(
3531							training_datafile => $training_datafile,
3532							csv_class_column_index => 1,
3533							csv_columns_for_features => [2,3,4,5],
3534							);
3535							$dt->get_training_data();
3536							$dt->calculate_first_order_probabilities();
3537							$dt->calculate_class_priors();
3538
3539							Subsequently, you would construct a decision tree by calling:
3540
3541							my $root_node = $dt->construct_decision_tree_classifier();
3542
3543							Now you and your company (with practically no employees) are ready to service the
3544							customers again. Suppose your computer needs to make a buy/sell decision about an
3545							investment prospect that is best described by:
3546
3547							price_to_earnings_ratio = low
3548							price_to_sales_ratio = very_low
3549							return_on_equity = none
3550							market_share = medium
3551
3552							All that your computer would need to do would be to construct a data vector like
3553
3554							my @data = qw / P_to_E=low
3555							P_to_S=very_low
3556							R_on_E=none
3557							MS=medium /;
3558
3559							and call the decision tree classifier you just constructed by
3560
3561							$dt->classify($root_node, \@data);
3562
3563							The answer returned will be 'buy' and 'sell', along with the associated
3564							probabilities. So if the probability of 'buy' is considerably greater than the
3565							probability of 'sell', that's what you should instruct your computer to do.
3566
3567							The chances are that, on the average, this approach would beat the performance of any
3568							of your individual traders who worked for you previously since the buy/sell decisions
3569							made by the computer would be based on the collective wisdom of all your previous
3570							traders. B
3571							captured by the silly little example here. However, it does nicely the convey the
3572							sense in which the current module could be used.>
3573
3574							=head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3575
3576							A feature is symbolic when its values are compared using string comparison operators.
3577							By the same token, a feature is numeric when its values are compared using numeric
3578							comparison operators. Having said that, features that take only a small number of
3579							numeric values in the training data can be treated symbolically provided you are
3580							careful about handling their values in the test data. At the least, you have to set
3581							the test data value for such a feature to its closest value in the training data.
3582							The module does that automatically for you for those numeric features for which the
3583							number different numeric values is less than a user-specified threshold. For those
3584							numeric features that the module is allowed to treat symbolically, this snapping of
3585							the values of the features in the test data to the small set of values in the training
3586							data is carried out automatically by the module. That is, after a user has told the
3587							module which numeric features to treat symbolically, the user need not worry about
3588							how the feature values appear in the test data.
3589
3590							The constructor parameter C let's you tell
3591							the module when to consider an otherwise numeric feature symbolically. Suppose you
3592							set this parameter to 10, that means that all numeric looking features that take 10
3593							or fewer different values in the training datafile will be considered to be symbolic
3594							features by the module. See the tutorial at
3595							L for
3596							further information on the implementation issues related to the symbolic and numeric
3597							features.
3598
3599							=head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3600
3601							For the purpose of estimating the probabilities, it is necessary to sample the range
3602							of values taken on by a numerical feature. For features with "nice" statistical
3603							properties, this sampling interval is set to the median of the differences between
3604							the successive feature values in the training data. (Obviously, as you would expect,
3605							you first sort all the values for a feature before computing the successive
3606							differences.) This logic will not work for the sort of a feature described below.
3607
3608							Consider a feature whose values are heavy-tailed, and, at the same time, the values
3609							span a million to one range. What I mean by heavy-tailed is that rare values can
3610							occur with significant probabilities. It could happen that most of the values for
3611							such a feature are clustered at one of the two ends of the range. At the same time,
3612							there may exist a significant number of values near the end of the range that is less
3613							populated. (Typically, features related to human economic activities --- such as
3614							wealth, incomes, etc. --- are of this type.) With the logic described in the
3615							previous paragraph, you could end up with a sampling interval that is much too small,
3616							which could result in millions of sampling points for the feature if you are not
3617							careful.
3618
3619							Beginning with Version 2.22, you have two options in dealing with such features. You
3620							can choose to go with the default behavior of the module, which is to sample the
3621							value range for such a feature over a maximum of 500 points. Or, you can supply an
3622							additional option to the constructor that sets a user-defined value for the number of
3623							points to use. The name of the option is C. The following
3624							script
3625
3626							construct_dt_for_heavytailed.pl
3627
3628							in the C directory shows an example of how to call the constructor of the
3629							module with the C option.
3630
3631
3632							=head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3633
3634							Versions 2.1 and higher include a new class named C, derived from
3635							the main class C, that runs a 10-fold cross-validation test on your
3636							training data to test its ability to discriminate between the classes mentioned in
3637							the training file.
3638
3639							The 10-fold cross-validation test divides all of the training data into ten parts,
3640							with nine parts used for training a decision tree and one part used for testing its
3641							ability to classify correctly. This selection of nine parts for training and one part
3642							for testing is carried out in all of the ten different possible ways.
3643
3644							The following code fragment illustrates how you invoke the testing function of the
3645							EvalTrainingData class:
3646
3647							my $training_datafile = "training.csv";
3648							my $eval_data = EvalTrainingData->new(
3649							training_datafile => $training_datafile,
3650							csv_class_column_index => 1,
3651							csv_columns_for_features => [2,3],
3652							entropy_threshold => 0.01,
3653							max_depth_desired => 3,
3654							symbolic_to_numeric_cardinality_threshold => 10,
3655							csv_cleanup_needed => 1,
3656							);
3657							$eval_data->get_training_data();
3658							$eval_data->evaluate_training_data()
3659
3660							The last statement above prints out a Confusion Matrix and the value of Training Data
3661							Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3662							The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3663							cross-validation test.
3664
3665							This testing functionality can also be used to find the best values to use for the
3666							constructor parameters C, C, and
3667							C.
3668
3669							The following two scripts in the C directory illustrate the use of the
3670							C class for testing the quality of your data:
3671
3672							evaluate_training_data1.pl
3673							evaluate_training_data2.pl
3674
3675
3676							=head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3677
3678							Assuming your training data is good, the quality of the results you get from a
3679							decision tree would depend on the choices you make for the constructor parameters
3680							C, C, and
3681							C. You can optimize your choices for
3682							these parameters by running the 10-fold cross-validation test that is made available
3683							in Versions 2.2 and higher through the new class C that is included
3684							in the module file. A description of how to run this test is in the previous section
3685							of this document.
3686
3687
3688							=head1 DECISION TREE INTROSPECTION
3689
3690							Starting with Version 2.30, you can ask the C class of the module to
3691							explain the classification decisions made at the different nodes of the decision
3692							tree.
3693
3694							Perhaps the most important bit of information you are likely to seek through DT
3695							introspection is the list of the training samples that fall directly in the portion
3696							of the feature space that is assigned to a node.
3697
3698							However, note that, when training samples are non-uniformly distributed in the
3699							underlying feature space, it is possible for a node to exist even when there are no
3700							training samples in the portion of the feature space assigned to the node. That is
3701							because the decision tree is constructed from the probability densities estimated
3702							from the training data. When the training samples are non-uniformly distributed, it
3703							is entirely possible for the estimated probability densities to be non-zero in a
3704							small region around a point even when there are no training samples specifically in
3705							that region. (After you have created a statistical model for, say, the height
3706							distribution of people in a community, the model may return a non-zero probability
3707							for the height values in a small interval even if the community does not include a
3708							single individual whose height falls in that interval.)
3709
3710							That a decision-tree node can exist even when there are no training samples in that
3711							portion of the feature space that belongs to the node is an important indication of
3712							the generalization ability of a decision-tree-based classifier.
3713
3714							In light of the explanation provided above, before the DTIntrospection class supplies
3715							any answers at all, it asks you to accept the fact that features can take on non-zero
3716							probabilities at a point in the feature space even though there are zero training
3717							samples at that point (or in a small region around that point). If you do not accept
3718							this rudimentary fact, the introspection class will not yield any answers (since you
3719							are not going to believe the answers anyway).
3720
3721							The point made above implies that the path leading to a node in the decision tree may
3722							test a feature for a certain value or threshold despite the fact that the portion of
3723							the feature space assigned to that node is devoid of any training data.
3724
3725							See the following three scripts in the Examples directory for how to carry out DT
3726							introspection:
3727
3728							introspection_in_a_loop_interactive.pl
3729
3730							introspection_show_training_samples_at_all_nodes_direct_influence.pl
3731
3732							introspection_show_training_samples_to_nodes_influence_propagation.pl
3733
3734							The first script places you in an interactive session in which you will first be
3735							asked for the node number you are interested in. Subsequently, you will be asked for
3736							whether or not you are interested in specific questions that the introspection can
3737							provide answers for. The second script descends down the decision tree and shows for
3738							each node the training samples that fall directly in the portion of the feature space
3739							assigned to that node. The third script shows for each training sample how it
3740							affects the decision-tree nodes either directly or indirectly through the
3741							generalization achieved by the probabilistic modeling of the data.
3742
3743							The output of the script
3744							C looks like:
3745
3746							Node 0: the samples are: None
3747							Node 1: the samples are: [sample_46 sample_58]
3748							Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3749							Node 3: the samples are: []
3750							Node 4: the samples are: []
3751							...
3752							...
3753
3754							The nodes for which no samples are listed come into existence through
3755							the generalization achieved by the probabilistic modeling of the data.
3756
3757							The output produced by the script
3758							C looks like
3759
3760							sample_1:
3761							nodes affected directly: [2 5 19 23]
3762							nodes affected through probabilistic generalization:
3763							2=> [3 4 25]
3764							25=> [26]
3765							5=> [6]
3766							6=> [7 13]
3767							7=> [8 11]
3768							8=> [9 10]
3769							11=> [12]
3770							13=> [14 18]
3771							14=> [15 16]
3772							16=> [17]
3773							19=> [20]
3774							20=> [21 22]
3775							23=> [24]
3776
3777							sample_4:
3778							nodes affected directly: [2 5 6 7 11]
3779							nodes affected through probabilistic generalization:
3780							2=> [3 4 25]
3781							25=> [26]
3782							5=> [19]
3783							19=> [20 23]
3784							20=> [21 22]
3785							23=> [24]
3786							6=> [13]
3787							13=> [14 18]
3788							14=> [15 16]
3789							16=> [17]
3790							7=> [8]
3791							8=> [9 10]
3792							11=> [12]
3793
3794							...
3795							...
3796							...
3797
3798							For each training sample, the display shown above first presents the list of nodes
3799							that are directly affected by the sample. A node is affected directly by a sample if
3800							the latter falls in the portion of the feature space that belongs to the former.
3801							Subsequently, for each training sample, the display shows a subtree of the nodes that
3802							are affected indirectly by the sample through the generalization achieved by the
3803							probabilistic modeling of the data. In general, a node is affected indirectly by a
3804							sample if it is a descendant of another node that is affected directly.
3805
3806							Also see the section titled B regarding how to invoke the
3807							introspection capabilities of the module in your own code.
3808
3809							=head1 METHODS
3810
3811							The module provides the following methods for constructing a decision tree from
3812							training data in a disk file and for classifying new data records with the decision
3813							tree thus constructed:
3814
3815							=over 4
3816
3817							=item B
3818
3819							my $dt = Algorithm::DecisionTree->new(
3820							training_datafile => $training_datafile,
3821							csv_class_column_index => 2,
3822							csv_columns_for_features => [3,4,5,6,7,8],
3823							entropy_threshold => 0.01,
3824							max_depth_desired => 8,
3825							symbolic_to_numeric_cardinality_threshold => 10,
3826							csv_cleanup_needed => 1,
3827							);
3828
3829							A call to C constructs a new instance of the C class.
3830							For this call to make sense, the training data in the training datafile must be
3831							in the CSV format.
3832
3833							=back
3834
3835							=head2 The Constructor Parameters
3836
3837							=over 8
3838
3839							=item C:
3840
3841							This parameter supplies the name of the file that contains the training data.
3842
3843							=item C:
3844
3845							When using a CSV file for your training data, this parameter supplies the zero-based
3846							column index for the column that contains the class label for each data record in the
3847							training file.
3848
3849
3850							=item C:
3851
3852							You need to set this parameter to 1 if your CSV file has double quoted strings (which
3853							may include commas) as values for the fields and if such values are allowed to
3854							include commas for, presumably, better readability.
3855
3856							=item C:
3857
3858							When using a CSV file for your training data, this parameter supplies a list of
3859							columns corresponding to the features you wish to use for decision tree construction.
3860							Each column is specified by its zero-based index.
3861
3862							=item C:
3863
3864							This parameter sets the granularity with which the entropies are sampled by the
3865							module. For example, a feature test at a node in the decision tree is acceptable if
3866							the entropy gain achieved by the test exceeds this threshold. The larger the value
3867							you choose for this parameter, the smaller the tree. Its default value is 0.001.
3868
3869							=item C:
3870
3871							This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3872							smaller the value you choose for this parameter, the smaller the tree.
3873
3874							=item C:
3875
3876							This parameter allows the module to treat an otherwise numeric feature symbolically
3877							if the number of different values the feature takes in the training data file does
3878							not exceed the value of this parameter.
3879
3880							=item C:
3881
3882							This parameter gives the user the option to set the number of points at which the
3883							value range for a feature should be sampled for estimating the probabilities. This
3884							parameter is effective only for those features that occupy a large value range and
3885							whose probability distributions are heavy tailed. B
3886							when you have a very large training dataset:> In general, the larger the dataset, the
3887							smaller the smallest difference between any two values for a numeric feature in
3888							relation to the overall range of values for that feature. In such cases, the module
3889							may use too large a number of bins for estimating the probabilities and that may slow
3890							down the calculation of the decision tree. You can get around this difficulty by
3891							explicitly giving a value to the 'C' parameter.
3892
3893							=back
3894
3895
3896							You can choose the best values to use for the last three constructor parameters by
3897							running a 10-fold cross-validation test on your training data through the class
3898							C that comes with Versions 2.1 and higher of this module. See the
3899							section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
3900
3901							=over
3902
3903							=item B
3904
3905							After you have constructed a new instance of the C class,
3906							you must now read in the training data that is the file named in the call to the
3907							constructor. This you do by:
3908
3909							$dt->get_training_data();
3910
3911
3912							=item B
3913
3914							If you wish to see the training data that was just digested by the module,
3915							call
3916
3917							$dt->show_training_data();
3918
3919							=item B
3920
3921							=item B
3922
3923							After the module has read the training data file, it needs to initialize the
3924							probability cache. This you do by invoking:
3925
3926							$dt->calculate_first_order_probabilities()
3927							$dt->calculate_class_priors()
3928
3929							=item B
3930
3931							With the probability cache initialized, it is time to construct a decision tree
3932							classifier. This you do by
3933
3934							my $root_node = $dt->construct_decision_tree_classifier();
3935
3936							This call returns an instance of type C. The C class is defined
3937							within the main package file. So, don't forget, that C<$root_node> in the above
3938							example call will be instantiated to an object of type C.
3939
3940							=item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3941
3942							$root_node->display_decision_tree(" ");
3943
3944							This will display the decision tree in your terminal window by using a recursively
3945							determined offset for each node as the display routine descends down the tree.
3946
3947							I have intentionally left the syntax fragment C<$root_node> in the above call to
3948							remind the reader that C is NOT called on the instance of
3949							the C we constructed earlier, but on the C instance returned by
3950							the call to C.
3951
3952							=item B
3953
3954							Let's say you want to classify the following data record:
3955
3956							my @test_sample = qw / g2=4.2
3957							grade=2.3
3958							gleason=4
3959							eet=1.7
3960							age=55.0
3961							ploidy=diploid /;
3962
3963							you'd make the following call:
3964
3965							my $classification = $dt->classify($root_node, \@test_sample);
3966
3967							where, again, C<$root_node> is an instance of type C returned by the call to
3968							C. The variable C<$classification> holds a
3969							reference to a hash whose keys are the class names and whose values the associated
3970							probabilities. The hash that is returned by the above call also includes a special
3971							key-value pair for a key named C. The value associated with this key
3972							is an anonymous array that holds the path, in the form of a list of nodes, from the
3973							root node to the leaf node in the decision tree where the final classification was
3974							made.
3975
3976
3977							=item B
3978
3979							This method allows you to use a decision-tree based classifier in an interactive
3980							mode. In this mode, a user is prompted for answers to the questions pertaining to
3981							the feature tests at the nodes of the tree. The syntax for invoking this method is:
3982
3983							my $classification = $dt->classify_by_asking_questions($root_node);
3984
3985							where C<$dt> is an instance of the C class returned by a
3986							call to C and C<$root_node> the root node of the decision tree returned by a
3987							call to C.
3988
3989							=back
3990
3991
3992							=head1 THE INTROSPECTION API
3993
3994							To construct an instance of C, you call
3995
3996							my $introspector = DTIntrospection->new($dt);
3997
3998							where you supply the instance of the C class you used for constructing
3999							the decision tree through the parameter C<$dt>. After you have constructed an
4000							instance of the introspection class, you must initialize it by
4001
4002							$introspector->initialize();
4003
4004							Subsequently, you can invoke either of the following methods:
4005
4006							$introspector->explain_classification_at_one_node($node);
4007
4008							$introspector->explain_classifications_at_multiple_nodes_interactively();
4009
4010							depending on whether you want introspection at a single specified node or inside an
4011							infinite loop for an arbitrary number of nodes.
4012
4013							If you want to output a tabular display that shows for each node in the decision tree
4014							all the training samples that fall in the portion of the feature space that belongs
4015							to that node, call
4016
4017							$introspector->display_training_samples_at_all_nodes_direct_influence_only();
4018
4019							If you want to output a tabular display that shows for each training sample a list of
4020							all the nodes that are affected directly AND indirectly by that sample, call
4021
4022							$introspector->display_training_training_samples_to_nodes_influence_propagation();
4023
4024							A training sample affects a node directly if the sample falls in the portion of the
4025							features space assigned to that node. On the other hand, a training sample is
4026							considered to affect a node indirectly if the node is a descendant of a node that is
4027							affected directly by the sample.
4028
4029
4030							=head1 BULK CLASSIFICATION OF DATA RECORDS
4031
4032							For large test datasets, you would obviously want to process an entire file of test
4033							data at a time. The following scripts in the C directory illustrate how you
4034							can do that:
4035
4036							classify_test_data_in_a_file.pl
4037
4038							This script requires three command-line arguments, the first argument names the
4039							training datafile, the second the test datafile, and the third the file in which the
4040							classification results are to be deposited.
4041
4042							The other examples directories, C, C, and
4043							C, also contain scripts that illustrate how to carry out
4044							bulk classification of data records when you wish to take advantage of bagging,
4045							boosting, or tree randomization. In their respective directories, these scripts are
4046							named:
4047
4048							bagging_for_bulk_classification.pl
4049							boosting_for_bulk_classification.pl
4050							classify_database_records.pl
4051
4052
4053							=head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4054
4055							It depends on whether you apply the classifier at once to all the data samples in a
4056							file, or whether you feed one data sample at a time into the classifier.
4057
4058							In general, the classifier returns soft classification for a test data vector. What
4059							that means is that, in general, the classifier will list all the classes to which a
4060							given data vector could belong and the probability of each such class label for the
4061							data vector. Run the examples scripts in the Examples directory to see how the output
4062							of classification can be displayed.
4063
4064							With regard to the soft classifications returned by this classifier, if the
4065							probability distributions for the different classes overlap in the underlying feature
4066							space, you would want the classifier to return all of the applicable class labels for
4067							a data vector along with the corresponding class probabilities. Another reason for
4068							why the decision tree classifier may associate significant probabilities with
4069							multiple class labels is that you used inadequate number of training samples to
4070							induce the decision tree. The good thing is that the classifier does not lie to you
4071							(unlike, say, a hard classification rule that would return a single class label
4072							corresponding to the partitioning of the underlying feature space). The decision
4073							tree classifier give you the best classification that can be made given the training
4074							data you fed into it.
4075
4076
4077							=head1 USING BAGGING
4078
4079							Starting with Version 3.0, you can use the class C that
4080							comes with the module to incorporate bagging in your decision tree based
4081							classification. Bagging means constructing multiple decision trees for different
4082							(possibly overlapping) segments of the data extracted from your training dataset and
4083							then aggregating the decisions made by the individual decision trees for the final
4084							classification. The aggregation of the classification decisions can average out the
4085							noise and bias that may otherwise affect the classification decision obtained from
4086							just one tree.
4087
4088							=over 4
4089
4090							=item B
4091
4092							A typical call to the constructor for the C class looks
4093							like:
4094
4095							use Algorithm::DecisionTreeWithBagging;
4096
4097							my $training_datafile = "stage3cancer.csv";
4098
4099							my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4100							training_datafile => $training_datafile,
4101							csv_class_column_index => 2,
4102							csv_columns_for_features => [3,4,5,6,7,8],
4103							entropy_threshold => 0.01,
4104							max_depth_desired => 8,
4105							symbolic_to_numeric_cardinality_threshold => 10,
4106							how_many_bags => 4,
4107							bag_overlap_fraction => 0.2,
4108							csv_cleanup_needed => 1,
4109							);
4110
4111							Note in particular the following two constructor parameters:
4112
4113							how_many_bags
4114
4115							bag_overlap_fraction
4116
4117							where, as the name implies, the parameter C controls how many bags
4118							(and, therefore, how many decision trees) will be constructed from your training
4119							dataset; and where the parameter C controls the degree of
4120							overlap between the bags. To understand what exactly is achieved by setting the
4121							parameter C to 0.2 in the above example, let's say that the
4122							non-overlapping partitioning of the training data between the bags results in 100
4123							training samples per bag. With bag_overlap_fraction set to 0.2, additional 20 samples
4124							drawn randomly from the other bags will be added to the data in each bag.
4125
4126							=back
4127
4128							=head2 B class>
4129
4130							=over 8
4131
4132							=item B
4133
4134							This method reads your training datafile, randomizes it, and then partitions it into
4135							the specified number of bags. Subsequently, if the constructor parameter
4136							C is non-zero, it adds to each bag additional samples drawn at
4137							random from the other bags. The number of these additional samples added to each bag
4138							is controlled by the constructor parameter C. If this
4139							parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4140							drawn from the other bags.
4141
4142							=item B
4143
4144							Shows for each bag the names of the training data samples in that bag.
4145
4146							=item B
4147
4148							Calls on the appropriate methods of the main C class to estimate the
4149							first-order probabilities from the data samples in each bag.
4150
4151							=item B
4152
4153							Calls on the appropriate method of the main C class to estimate the
4154							class priors for the data samples in each bag.
4155
4156							=item B
4157
4158							Calls on the appropriate method of the main C class to construct a
4159							decision tree from the training data in each bag.
4160
4161							=item B
4162
4163							Display separately the decision tree for each bag..
4164
4165							=item B
4166
4167							Calls on the appropriate methods of the main C class to classify the
4168							argument test sample.
4169
4170							=item B
4171
4172							Displays separately the classification decision made by each the decision tree
4173							constructed for each bag.
4174
4175							=item B
4176
4177							Using majority voting, this method aggregates the classification decisions made by
4178							the individual decision trees into a single decision.
4179
4180							=back
4181
4182							See the example scripts in the directory C for how to call these
4183							methods for classifying individual samples and for bulk classification when you place
4184							all your test samples in a single file.
4185
4186							=head1 USING BOOSTING
4187
4188							Starting with Version 3.20, you can use the class C for
4189							constructing a boosted decision-tree classifier. Boosting results in a cascade of
4190							decision trees in which each decision tree is constructed with samples that are
4191							mostly those that are misclassified by the previous decision tree. To be precise,
4192							you create a probability distribution over the training samples for the selection of
4193							samples for training each decision tree in the cascade. To start out, the
4194							distribution is uniform over all of the samples. Subsequently, this probability
4195							distribution changes according to the misclassifications by each tree in the cascade:
4196							if a sample is misclassified by a given tree in the cascade, the probability of its
4197							being selected for training the next tree is increased significantly. You also
4198							associate a trust factor with each decision tree depending on its power to classify
4199							correctly all of the training data samples. After a cascade of decision trees is
4200							constructed in this manner, you construct a final classifier that calculates the
4201							class label for a test data sample by taking into account the classification
4202							decisions made by each individual tree in the cascade, the decisions being weighted
4203							by the trust factors associated with the individual classifiers. These boosting
4204							notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4205							celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4206							Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4207							the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4208							tutorial introduction to AdaBoost, see L
4209
4210							Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4211							boosting apply only to the case of binary classification. Additionally, your
4212							training dataset must capture all of the significant statistical variations in the
4213							classes represented therein.
4214
4215							=over 4
4216
4217							=item B
4218
4219							If you'd like to experiment with boosting, a typical call to the constructor for the
4220							C class looks like:
4221
4222							use Algorithm::BoostedDecisionTree;
4223							my $training_datafile = "training6.csv";
4224							my $boosted = Algorithm::BoostedDecisionTree->new(
4225							training_datafile => $training_datafile,
4226							csv_class_column_index => 1,
4227							csv_columns_for_features => [2,3],
4228							entropy_threshold => 0.01,
4229							max_depth_desired => 8,
4230							symbolic_to_numeric_cardinality_threshold => 10,
4231							how_many_stages => 4,
4232							csv_cleanup_needed => 1,
4233							);
4234
4235							Note in particular the constructor parameter:
4236
4237							how_many_stages
4238
4239							As its name implies, this parameter controls how many stages will be used in the
4240							boosted decision tree classifier. As mentioned above, a separate decision tree is
4241							constructed for each stage of boosting using a set of training samples that are drawn
4242							through a probability distribution maintained over the entire training dataset.
4243
4244							=back
4245
4246							=head2 B class>
4247
4248							=over 8
4249
4250							=item B
4251
4252							This method reads your training datafile, creates the data structures from the data
4253							ingested for constructing the base decision tree.
4254
4255							=item B
4256
4257							Writes to the standard output the training data samples and also some relevant
4258							properties of the features used in the training dataset.
4259
4260							=item B
4261
4262							Calls on the appropriate methods of the main C class to estimate the
4263							first-order probabilities and the class priors.
4264
4265							=item B
4266
4267							Calls on the appropriate method of the main C class to construct the
4268							base decision tree.
4269
4270							=item B
4271
4272							Displays the base decision tree in your terminal window. (The textual form of the
4273							decision tree is written out to the standard output.)
4274
4275							=item B
4276
4277							Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4278							earlier, the training samples for each tree in the cascade are drawn using a
4279							probability distribution over the entire training dataset. This probability
4280							distribution for any given tree in the cascade is heavily influenced by which
4281							training samples are misclassified by the previous tree.
4282
4283							=item B
4284
4285							Displays separately in your terminal window the decision tree constructed for each
4286							stage of the cascade. (The textual form of the trees is written out to the standard
4287							output.)
4288
4289							=item B
4290
4291							Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4292
4293							=item B
4294
4295							You can call this method to display in your terminal window the classification
4296							decision made by each decision tree in the cascade. The method also prints out the
4297							trust factor associated with each decision tree. It is important to look
4298							simultaneously at the classification decision and the trust factor for each tree ---
4299							since a classification decision made by a specific tree may appear bizarre for a
4300							given test sample. This method is useful primarily for debugging purposes.
4301
4302							=item B
4303
4304							As with the previous method, this method is useful mostly for debugging. It returns
4305							class labels for the samples misclassified by the stage whose integer index is
4306							supplied as an argument to the method. Say you have 10 stages in your cascade. The
4307							value of the argument C would go from 0 to 9, with 0 corresponding to
4308							the base tree.
4309
4310							=item B
4311
4312							Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4313							classification decisions made by the individual trees while taking into account the
4314							trust factors associated with the trees. As mentioned earlier, we associate with
4315							each tree of the cascade a trust factor that depends on the overall misclassification
4316							rate associated with that tree.
4317
4318							=back
4319
4320							See the example scripts in the C subdirectory for how to call the
4321							methods listed above for classifying individual data samples with boosting and for
4322							bulk classification when you place all your test samples in a single file.
4323
4324
4325							=head1 USING RANDOMIZED DECISION TREES
4326
4327							Consider the following two situations that call for using randomized decision trees,
4328							meaning multiple decision trees that are trained using data extracted randomly from a
4329							large database of training samples:
4330
4331							(1) Consider a two-class problem for which the training database is grossly
4332							imbalanced in how many majority-class samples it contains vis-a-vis the number of
4333							minority class samples. Let's assume for a moment that the ratio of majority class
4334							samples to minority class samples is 1000 to 1. Let's also assume that you have a
4335							test dataset that is drawn randomly from the same population mixture from which the
4336							training database was created. Now consider a stupid data classification program
4337							that classifies everything as belonging to the majority class. If you measure the
4338							classification accuracy rate as the ratio of the number of samples correctly
4339							classified to the total number of test samples selected randomly from the population,
4340							this classifier would work with an accuracy of 99.99%.
4341
4342							(2) Let's now consider another situation in which we are faced with a huge training
4343							database but in which every class is equally well represented. Feeding all the data
4344							into a single decision tree would be akin to polling all of the population of the
4345							United States for measuring the Coke-versus-Pepsi preference in the country. You are
4346							likely to get better results if you construct multiple decision trees, each trained
4347							with a collection of training samples drawn randomly from the training database.
4348							After you have created all the decision trees, your final classification decision
4349							could then be based on, say, majority voting by the trees.
4350
4351							In summary, the C class allows you to solve the following
4352							two problems: (1) Data classification using the needle-in-a-haystack metaphor, that
4353							is, when a vast majority of your training samples belong to just one class. And (2)
4354							You have access to a very large database of training samples and you wish to
4355							construct an ensemble of decision trees for classification.
4356
4357							=over 4
4358
4359							=item B
4360
4361							Here is how you'd call the C constructor for
4362							needle-in-a-haystack classification:
4363
4364							use Algorithm::RandomizedTreesForBigData;
4365							my $training_datafile = "your_database.csv";
4366							my $rt = Algorithm::RandomizedTreesForBigData->new(
4367							training_datafile => $training_datafile,
4368							csv_class_column_index => 48,
4369							csv_columns_for_features => [24,32,33,34,41],
4370							entropy_threshold => 0.01,
4371							max_depth_desired => 8,
4372							symbolic_to_numeric_cardinality_threshold => 10,
4373							how_many_trees => 5,
4374							looking_for_needles_in_haystack => 1,
4375							csv_cleanup_needed => 1,
4376							);
4377
4378							Note in particular the constructor parameters:
4379
4380							looking_for_needles_in_haystack
4381							how_many_trees
4382
4383							The first of these parameters, C, invokes the logic for
4384							constructing an ensemble of decision trees, each based on a training dataset that
4385							uses all of the minority class samples, and a random drawing from the majority class
4386							samples.
4387
4388							Here is how you'd call the C constructor for a more
4389							general attempt at constructing an ensemble of decision trees, with each tree trained
4390							with randomly drawn samples from a large database of training data (without paying
4391							attention to the differences in the sizes of the populations for the different
4392							classes):
4393
4394							use Algorithm::RandomizedTreesForBigData;
4395							my $training_datafile = "your_database.csv";
4396							my $rt = Algorithm::RandomizedTreesForBigData->new(
4397							training_datafile => $training_datafile,
4398							csv_class_column_index => 2,
4399							csv_columns_for_features => [3,4,5,6,7,8],
4400							entropy_threshold => 0.01,
4401							max_depth_desired => 8,
4402							symbolic_to_numeric_cardinality_threshold => 10,
4403							how_many_trees => 3,
4404							how_many_training_samples_per_tree => 50,
4405							csv_cleanup_needed => 1,
4406							);
4407
4408							Note in particular the constructor parameters:
4409
4410							how_many_training_samples_per_tree
4411							how_many_trees
4412
4413							When you set the C parameter, you are not allowed
4414							to also set the C parameter, and vice versa.
4415
4416							=back
4417
4418							=head2 B class>
4419
4420							=over 8
4421
4422							=item B
4423
4424							What this method does depends on which of the two constructor parameters ---
4425							C or C --- is
4426							set. When the former is set, it creates a collection of training datasets for
4427							C number of decision trees, with each dataset being a mixture of the
4428							minority class and sample drawn randomly from the majority class. However, when the
4429							latter option is set, all the datasets are drawn randomly from the training database
4430							with no particular attention given to the relative populations of the two classes.
4431
4432							=item B
4433
4434							As the name implies, this method shows the training data being used for all the
4435							decision trees. This method is useful for debugging purposes using small datasets.
4436
4437							=item B
4438
4439							Calls on the appropriate method of the main C to estimate the
4440							first-order probabilities for the training dataset to be used for each decision tree.
4441
4442							=item B
4443
4444							Calls on the appropriate method of the main C class to estimate the
4445							class priors for the training dataset to be used for each decision tree.
4446
4447							=item B
4448
4449							Calls on the appropriate method of the main C class to construct the
4450							decision trees.
4451
4452							=item B
4453
4454							Displays all the decision trees in your terminal window. (The textual form of the
4455							decision trees is written out to the standard output.)
4456
4457							=item B
4458
4459							The test_sample is sent to each decision tree for classification.
4460
4461							=item B
4462
4463							The classification decisions returned by the individual decision trees are written
4464							out to the standard output.
4465
4466							=item B
4467
4468							This method aggregates the classification results returned by the individual decision
4469							trees and returns the majority decision.
4470
4471							=back
4472
4473							=head1 CONSTRUCTING REGRESSION TREES:
4474
4475							Decision tree based modeling requires that the class labels be distinct. That is,
4476							the training dataset must contain a relatively small number of discrete class labels
4477							for all of your data records if you want to model the data with one or more decision
4478							trees. However, when one is trying to understand all of the associational
4479							relationships that exist in a large database, one often runs into situations where,
4480							instead of discrete class labels, you have a continuously valued variable as a
4481							dependent variable whose values are predicated on a set of feature values. It is for
4482							such situations that you will find useful the new class C that is now
4483							a part of the C module. The C class has been
4484							programmed as a subclass of the main C class.
4485
4486							You can think of regression with a regression tree as a powerful generalization of
4487							the very commonly used Linear Regression algorithms. Although you can certainly
4488							carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4489							modeling nonlinearities between the predictor variables and the dependent variable,
4490							specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4491							can inject continuities between the predictor and the predicted variables that may
4492							not really exist in the real data. Regression trees, on the other hand, give you a
4493							piecewise linear relationship between the predictor and the predicted variables that
4494							is freed from the constraints of superimposed continuities at the joins between the
4495							different segments. See the following tutorial for further information regarding the
4496							standard linear regression approach and the regression that can be achieved with the
4497							RegressionTree class in this module:
4498							L
4499
4500							The RegressionTree class in the current version of the module assumes that all of
4501							your data is numerical. That is, unlike what is possible with the DecisionTree class
4502							(and the other more closely related classes in this module) that allow your training
4503							file to contain a mixture of numerical and symbolic data, the RegressionTree class
4504							requires that ALL of your data be numerical. I hope to relax this constraint in
4505							future versions of this module. Obviously, the dependent variable will always be
4506							numerical for regression.
4507
4508							See the example scripts in the directory C if you wish to become
4509							more familiar with the regression capabilities of the module.
4510
4511							=over 4
4512
4513							=item B
4514
4515							my $training_datafile = "gendata5.csv";
4516							my $rt = Algorithm::RegressionTree->new(
4517							training_datafile => $training_datafile,
4518							dependent_variable_column => 2,
4519							predictor_columns => [1],
4520							mse_threshold => 0.01,
4521							max_depth_desired => 2,
4522							jacobian_choice => 0,
4523							csv_cleanup_needed => 1,
4524							);
4525
4526							Note in particular the constructor parameters:
4527
4528							dependent_variable
4529							predictor_columns
4530							mse_threshold
4531							jacobian_choice
4532
4533							The first of these parameters, C, is set to the column index in
4534							the CSV file for the dependent variable. The second constructor parameter,
4535							C, tells the system as to which columns contain values for the
4536							predictor variables. The third parameter, C, is for deciding when to
4537							partition the data at a node into two child nodes as a regression tree is being
4538							constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4539							partitioning any of the features at a node is smaller than C, that
4540							node becomes a leaf node of the regression tree.
4541
4542							The last parameter, C, must be set to either 0 or 1 or 2. Its
4543							default value is 0. When this parameter equals 0, the regression coefficients are
4544							calculated using the linear least-squares method and no further "refinement" of the
4545							coefficients is carried out using gradient descent. This is the fastest way to
4546							calculate the regression coefficients. When C is set to 1, you get
4547							a weak version of gradient descent in which the Jacobian is set to the "design
4548							matrix" itself. Choosing 2 for C results in a more reasonable
4549							approximation to the Jacobian. That, however, is at a cost of much longer
4550							computation time. B For most cases, using 0 for C is the
4551							best choice. See my tutorial "I" for why
4552							that is the case.
4553
4554							=back
4555
4556							=head2 B class>
4557
4558							=over 8
4559
4560							=item B
4561
4562							Only CSV training datafiles are allowed. Additionally, the first record in the file
4563							must list the names of the fields, and the first column must contain an integer ID
4564							for each record.
4565
4566							=item B
4567
4568							As the name implies, this is the method that construct a regression tree.
4569
4570							=item B
4571
4572							Displays the regression tree, as the name implies. The white-space string argument
4573							specifies the offset to use in displaying the child nodes in relation to a parent
4574							node.
4575
4576							=item B
4577
4578							You call this method after you have constructed a regression tree if you want to
4579							calculate the prediction for one sample. The parameter C<$root_node> is what is
4580							returned by the call C. The formatting of the argument
4581							bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4582							using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4583							this case, the C<$test_sample> parameter will be bound to a list that will look like
4584
4585							['xvar1 = 23.4', 'xvar2 = 12.9']
4586
4587							Arbitrary amount of white space, including none, on the two sides of the equality
4588							symbol is allowed in the construct shown above. A call to this method returns a
4589							dictionary with two key-value pairs. One of the keys is called C and
4590							the other C. The value associated with key C is the path
4591							in the regression tree to the leaf node that yielded the prediction. And the value
4592							associated with the key C is the answer you are looking for.
4593
4594							=item B
4595
4596							This call calculates the predictions for all of the predictor variables data in your
4597							training file. The parameter C<$root_node> is what is returned by the call to
4598							C. The values for the dependent variable thus predicted
4599							can be seen by calling C, which is the method mentioned below.
4600
4601							=item B
4602
4603							This method displays the results obtained by calling the prediction method of the
4604							previous entry. This method also creates a hardcopy of the plots and saves it as a
4605							C<.png> disk file. The name of this output file is always C.
4606
4607							=item B
4608
4609							This method carries out an error analysis of the predictions for the samples in your
4610							training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4611							regression, the MSE for the data samples at each of the leaf nodes of the regression
4612							tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4613							The parameter C<$root_node> in the call syntax is what is returned by the call to
4614							C.
4615
4616							=item B
4617
4618							Call this method if you want to apply the regression tree to all your test data in a
4619							disk file. The predictions for all of the test samples in the disk file are written
4620							out to another file whose name is the same as that of the test file except for the
4621							addition of C<_output> in the name of the file. The parameter C<$filename> is the
4622							name of the disk file that contains the test data. And the parameter C<$columns> is a
4623							list of the column indices for the predictor variables in the test file.
4624
4625							=back
4626
4627							=head1 GENERATING SYNTHETIC TRAINING DATA
4628
4629							The module file contains the following additional classes: (1)
4630							C, and (2) C for
4631							generating synthetic training data.
4632
4633							The class C outputs one CSV file for the
4634							training data and another one for the test data for experimenting with numeric
4635							features. The numeric values are generated using a multivariate Gaussian
4636							distribution whose mean and covariance are specified in a parameter file. See the
4637							file C in the C directory for an example of such a
4638							parameter file. Note that the dimensionality of the data is inferred from the
4639							information you place in the parameter file.
4640
4641							The class C generates synthetic training for the
4642							purely symbolic case. The relative frequencies of the different possible values for
4643							the features is controlled by the biasing information you place in a parameter file.
4644							See C for an example of such a file.
4645
4646
4647							=head1 THE C DIRECTORY
4648
4649							See the C directory in the distribution for how to construct a decision
4650							tree, and how to then classify new data using the decision tree. To become more
4651							familiar with the module, run the scripts
4652
4653							construct_dt_and_classify_one_sample_case1.pl
4654							construct_dt_and_classify_one_sample_case2.pl
4655							construct_dt_and_classify_one_sample_case3.pl
4656							construct_dt_and_classify_one_sample_case4.pl
4657
4658							The first script is for the purely symbolic case, the second for the case that
4659							involves both numeric and symbolic features, the third for the case of purely numeric
4660							features, and the last for the case when the training data is synthetically generated
4661							by the script C.
4662
4663							Next run the following script as it is for bulk classification of data records placed
4664							in a CSV file:
4665
4666							classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4667
4668							The script first constructs a decision tree using the training data in the training
4669							file supplied by the first argument file C. The script then
4670							calculates the class label for each data record in the test data file supplied
4671							through the second argument file, C. The estimated class labels are
4672							written out to the output file which in the call shown above is C. An
4673							important thing to note here is that your test file --- in this case C ---
4674							must have a column for class labels. Obviously, in real-life situations, there will
4675							be no class labels in this column. What that is the case, you can place an empty
4676							string C<""> there for each data record. This is demonstrated by the following call:
4677
4678							classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4679
4680							The following script in the C directory
4681
4682							classify_by_asking_questions.pl
4683
4684							shows how you can use a decision-tree classifier interactively. In this mode, you
4685							first construct the decision tree from the training data and then the user is
4686							prompted for answers to the feature tests at the nodes of the tree.
4687
4688							If your training data has a feature whose values span a large range and, at the same
4689							time, are characterized by a heavy-tail distribution, you should look at the script
4690
4691							construct_dt_for_heavytailed.pl
4692
4693							to see how to use the option C in the call to the
4694							constructor. This option was introduced in Version 2.22 for dealing with such
4695							features. If you do not set this option, the module will use the default value of
4696							500 for the number of points at which to sample the value range for such a feature.
4697
4698							The C directory also contains the following scripts:
4699
4700							generate_training_data_numeric.pl
4701							generate_training_data_symbolic.pl
4702
4703							that show how you can use the module to generate synthetic training. Synthetic
4704							training is generated according to the specifications laid out in a parameter file.
4705							There are constraints on how the information is laid out in a parameter file. See
4706							the files C and C in the C directory
4707							for how to structure these files.
4708
4709							The C directory of Versions 2.1 and higher of the module also contains the
4710							following two scripts:
4711
4712							evaluate_training_data1.pl
4713							evaluate_training_data2.pl
4714
4715							that illustrate how the Perl class C can be used to evaluate the
4716							quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4717							class is a subclass of the C class in the module file. See the README
4718							in the C directory for further information regarding these two scripts.
4719
4720							The C directory of Versions 2.31 and higher of the module contains the
4721							following three scripts:
4722
4723							introspection_in_a_loop_interactive.pl
4724
4725							introspection_show_training_samples_at_all_nodes_direct_influence.pl
4726
4727							introspection_show_training_samples_to_nodes_influence_propagation.pl
4728
4729							The first script illustrates how to use the C class of the module
4730							interactively for generating explanations for the classification decisions made at
4731							the nodes of the decision tree. In the interactive session you are first asked for
4732							the node number you are interested in. Subsequently, you are asked for whether or
4733							not you are interested in specific questions that the introspector can provide
4734							answers for. The second script generates a tabular display that shows for each node
4735							of the decision tree a list of the training samples that fall directly in the portion
4736							of the feature space assigned that node. (As mentioned elsewhere in this
4737							documentation, when this list is empty for a node, that means the node is a result of
4738							the generalization achieved by probabilistic modeling of the data. Note that this
4739							module constructs a decision tree NOT by partitioning the set of training samples,
4740							BUT by partitioning the domains of the probability density functions.) The third
4741							script listed above also generates a tabular display, but one that shows how the
4742							influence of each training sample propagates in the tree. This display first shows
4743							the list of nodes that are affected directly by the data in a training sample. This
4744							list is followed by an indented display of the nodes that are affected indirectly by
4745							the training sample. A training sample affects a node indirectly if the node is a
4746							descendant of one of the nodes affected directly.
4747
4748							The latest addition to the Examples directory is the script:
4749
4750							get_indexes_associated_with_fields.py
4751
4752							As to why you may find this script useful, note that large database files may have
4753							hundreds of fields and it is not always easy to figure out what numerical index is
4754							associated with a given field. At the same time, the constructor of the DecisionTree
4755							module requires that the field that holds the class label and the fields that contain
4756							the feature values be specified by their numerical zero-based indexes. If you have a
4757							large database and you are faced with this problem, you can run this script to see
4758							the zero-based numerical index values associated with the different columns of your
4759							CSV file.
4760
4761
4762							=head1 THE C DIRECTORY
4763
4764							The C directory contains the following scripts:
4765
4766							bagging_for_classifying_one_test_sample.pl
4767
4768							bagging_for_bulk_classification.pl
4769
4770							As the names of the scripts imply, the first shows how to call the different methods
4771							of the C class for classifying a single test sample. When
4772							you are classifying a single test sample, you can also see how each bag is
4773							classifying the test sample. You can, for example, display the training data used in
4774							each bag, the decision tree constructed for each bag, etc.
4775
4776							The second script is for the case when you place all of the test samples in a single
4777							file. The demonstration script displays for each test sample a single aggregate
4778							classification decision that is obtained through majority voting by all the decision
4779							trees.
4780
4781
4782							=head1 THE C DIRECTORY
4783
4784							The C subdirectory in the main installation directory contains the
4785							following three scripts:
4786
4787							boosting_for_classifying_one_test_sample_1.pl
4788
4789							boosting_for_classifying_one_test_sample_2.pl
4790
4791							boosting_for_bulk_classification.pl
4792
4793							As the names of the first two scripts imply, these show how to call the different
4794							methods of the C class for classifying a single test sample.
4795							When you are classifying a single test sample, you can see how each stage of the
4796							cascade of decision trees is classifying the test sample. You can also view each
4797							decision tree separately and also see the trust factor associated with the tree.
4798
4799							The third script is for the case when you place all of the test samples in a single
4800							file. The demonstration script outputs for each test sample a single aggregate
4801							classification decision that is obtained through trust-factor weighted majority
4802							voting by all the decision trees.
4803
4804							=head1 THE C DIRECTORY
4805
4806							The C directory shows example scripts that you can use to
4807							become more familiar with the C class for solving
4808							needle-in-a-haystack and big-data data classification problems. These scripts are:
4809
4810							randomized_trees_for_classifying_one_test_sample_1.pl
4811
4812							randomized_trees_for_classifying_one_test_sample_2.pl
4813
4814							classify_database_records.pl
4815
4816							The first script shows the constructor options to use for solving a
4817							needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
4818							training data belongs to just one class. The second script shows the constructor
4819							options for using randomized decision trees for the case when you have access to a
4820							very large database of training samples and you'd like to construct an ensemble of
4821							decision trees using training samples pulled randomly from the training database.
4822							The last script illustrates how you can evaluate the classification power of an
4823							ensemble of decision trees as constructed by C by classifying
4824							a large number of test samples extracted randomly from the training database.
4825
4826
4827							=head1 THE C DIRECTORY
4828
4829							The C subdirectory in the main installation directory shows
4830							example scripts that you can use to become familiar with regression trees and how
4831							they can be used for nonlinear regression. If you are new to the concept of
4832							regression trees, start by executing the following scripts without changing them and
4833							see what sort of output is produced by them:
4834
4835							regression4.pl
4836
4837							regression5.pl
4838
4839							regression6.pl
4840
4841							regression8.pl
4842
4843							The C script involves only one predictor variable and one dependent
4844							variable. The training data for this exercise is drawn from the file C.
4845							This data file contains strongly nonlinear data. When you run the script
4846							C, you will see how much better the result from tree regression is
4847							compared to what you can get with linear regression.
4848
4849							The C script is essentially the same as the previous script except
4850							for the fact that the training datafile used in this case, C, consists
4851							of three noisy segments, as opposed to just two in the previous case.
4852
4853							The script C deals with the case when we have two predictor variables
4854							and one dependent variable. You can think of the data as consisting of noisy height
4855							values over an C<(x1,x2)> plane. The data used in this script is drawn from the csv
4856							file C.
4857
4858							Finally, the script C shows how you can carry out bulk prediction for
4859							all your test data records in a disk file. The script writes all the calculated
4860							predictions into another disk file whose name is derived from the name of the test
4861							data file.
4862
4863
4864							=head1 EXPORT
4865
4866							None by design.
4867
4868							=head1 BUGS
4869
4870							Please notify the author if you encounter any bugs. When sending email, please place
4871							the string 'DecisionTree' in the subject line.
4872
4873							=head1 INSTALLATION
4874
4875							Download the archive from CPAN in any directory of your choice. Unpack the archive
4876							with a command that on a Linux machine would look like:
4877
4878							tar zxvf Algorithm-DecisionTree-3.43.tar.gz
4879
4880							This will create an installation directory for you whose name will be
4881							C. Enter this directory and execute the following
4882							commands for a standard install of the module if you have root privileges:
4883
4884							perl Makefile.PL
4885							make
4886							make test
4887							sudo make install
4888
4889							If you do not have root privileges, you can carry out a non-standard install the
4890							module in any directory of your choice by:
4891
4892							perl Makefile.PL prefix=/some/other/directory
4893							make
4894							make test
4895							make install
4896
4897							With a non-standard install, you may also have to set your PERL5LIB environment
4898							variable so that this module can find the required other modules. How you do that
4899							would depend on what platform you are working on. In order to install this module in
4900							a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
4901							variable by
4902
4903							setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4904
4905							If I used bash, I'd need to declare:
4906
4907							export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4908
4909							Another option for a non-standard install would be to place at the top of your own
4910							scripts the following invocation:
4911
4912							BEGIN {
4913							unshift @INC, "/some/other/directory/blib/lib";
4914							}
4915
4916
4917							=head1 THANKS
4918
4919							I wish to thank many users of this module for their feedback. Many of the
4920							improvements I have made to the module over the years are a result of the feedback
4921							received.
4922
4923							I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
4924							Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
4925							used for testing the module. Slaven's feedback in the form of the Bug report
4926							C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
4927							the required version of Perl to 5.10.
4928
4929							On the basis of the report posted by Slaven at C regarding Version 2.27,
4930							I am removing the Perl version restriction altogether from Version 2.30. Thanks
4931							Slaven!
4932
4933							Version 3.43 was prompted by a bug report sent by Jan Trukenmüller regarding a
4934							missing backslash in a regex in the C function. Thanks, Jan!
4935
4936							=head1 AUTHOR
4937
4938							The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
4939							with the publication of the book I by John-Wiley. If
4940							interested, visit his web page at Purdue to find out what this project was all
4941							about. You might like I especially if you enjoyed reading
4942							Harry Potter as a kid (or even as an adult, for that matter).
4943
4944							If you send email regarding this module, please place the string "DecisionTree" in
4945							your subject line to get past my spam filter. Avi Kak's email address is
4946							C
4947
4948							=head1 COPYRIGHT
4949
4950							This library is free software; you can redistribute it and/or modify it under the
4951							same terms as Perl itself.
4952
4953							Copyright 2017 Avinash Kak
4954
4955							=cut
4956