File Coverage

blib/lib/Algorithm/DecisionTree.pm
Criterion     Covered    Total       %
statement         780     2249    34.6
branch            201      708    28.3
condition          42      234    17.9
subroutine         44       92    47.8
pod                 8       40    20.0
total            1075     3323    32.3


line stmt bran cond sub pod time code
1             package Algorithm::DecisionTree;
2              
3             #--------------------------------------------------------------------------------------
4             # Copyright (c) 2016 Avinash Kak. All rights reserved. This program is free
5             # software. You may modify and/or distribute it under the same terms as Perl itself.
6             # This copyright notice must remain attached to the file.
7             #
8             # Algorithm::DecisionTree is a Perl module for decision-tree based classification of
9             # multidimensional data.
10             # -------------------------------------------------------------------------------------
11              
12             #use 5.10.0;
13 1     1   12964 use strict;
  1         2  
  1         24  
14 1     1   3 use warnings;
  1         1  
  1         20  
15 1     1   4 use Carp;
  1         3  
  1         12093  
16              
17             our $VERSION = '3.42';
18              
19             ############################################ Constructor ##############################################
20             sub new {
21 1     1 1 330 my ($class, %args, $eval_or_boosting_mode);
22 1 50       12 if (@_ % 2 != 0) {
23 1         5 ($class, %args) = @_;
24             } else {
25 0         0 $class = shift;
26 0         0 $eval_or_boosting_mode = shift;
27 0 0 0     0 die unless $eval_or_boosting_mode eq 'evalmode' || $eval_or_boosting_mode eq 'boostingmode';
28 0 0       0 die "Only one string arg allowed in eval and boosting modes" if @_;
29             }
30 1 50       3 unless ($eval_or_boosting_mode) {
31 1         3 my @params = keys %args;
32 1 50       3 croak "\nYou have used a wrong name for a keyword argument --- perhaps a misspelling\n"
33             if check_for_illegal_params2(@params) == 0;
34             }
35             bless {
36             _training_datafile => $args{training_datafile},
37             _entropy_threshold => $args{entropy_threshold} || 0.01,
38             _max_depth_desired => exists $args{max_depth_desired} ?
39             $args{max_depth_desired} : undef,
40             _debug1 => $args{debug1} || 0,
41             _debug2 => $args{debug2} || 0,
42             _debug3 => $args{debug3} || 0,
43             _csv_class_column_index => $args{csv_class_column_index} || undef,
44             _csv_columns_for_features => $args{csv_columns_for_features} || undef,
45             _symbolic_to_numeric_cardinality_threshold
46             => $args{symbolic_to_numeric_cardinality_threshold} || 10,
47             _number_of_histogram_bins => $args{number_of_histogram_bins} || undef,
48 1 50 50     53 _csv_cleanup_needed => $args{csv_cleanup_needed} || 0,
      50        
      50        
      50        
      50        
      50        
      50        
      50        
      50        
49             _training_data => [],
50             _root_node => undef,
51             _probability_cache => {},
52             _entropy_cache => {},
53             _training_data_hash => {},
54             _features_and_values_hash => {},
55             _samples_class_label_hash => {},
56             _class_names => [],
57             _class_priors => [],
58             _class_priors_hash => {},
59             _feature_names => [],
60             _numeric_features_valuerange_hash => {},
61             _sampling_points_for_numeric_feature_hash => {},
62             _feature_values_how_many_uniques_hash => {},
63             _prob_distribution_numeric_features_hash => {},
64             _histogram_delta_hash => {},
65             _num_of_histogram_bins_hash => {},
66             }, $class;
67              
68             }
69              
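A minimal sketch of a constructor call, using the keyword arguments assigned above; the file name, column indices, and parameter values are illustrative only:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;

    # Illustrative settings -- adapt the file name and CSV column indices to your data.
    my $dt = Algorithm::DecisionTree->new(
                  training_datafile                         => 'training_data.csv',
                  csv_class_column_index                    => 1,
                  csv_columns_for_features                  => [2,3,4,5],
                  entropy_threshold                         => 0.01,
                  max_depth_desired                         => 5,
                  symbolic_to_numeric_cardinality_threshold => 10,
                  csv_cleanup_needed                        => 1,
             );
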
70             #################################### Classify with Decision Tree #######################################
71              
72             ## Classifies one test sample at a time using the decision tree constructed from
73             ## your training file. The data record for the test sample must be supplied as
74             ## shown in the scripts in the `examples' subdirectory. See the scripts
75             ## construct_dt_and_classify_one_sample_caseX.pl in that subdirectory.
76             sub classify {
77 0     0 1 0 my $self = shift;
78 0         0 my $root_node = shift;
79 0         0 my $feature_and_values = shift;
80 0         0 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
81 0         0 my @features_and_values = @$feature_and_values;
82 0         0 @features_and_values = @{deep_copy_array(\@features_and_values)};
  0         0  
83 0 0       0 die "\n\nError in the names you have used for features and/or values. " .
84             "Try using the csv_cleanup_needed option in the constructor call."
85             unless $self->check_names_used(\@features_and_values);
86 0         0 my @new_features_and_values = ();
87 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
88 0         0 foreach my $feature_and_value (@features_and_values) {
89 0         0 $feature_and_value =~ /$pattern/;
90 0         0 my ($feature, $value) = ($1, $2);
91 0         0 my $newvalue = $value;
92 0         0 my @unique_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
93 0         0 my $not_all_values_float = 0;
94 0 0       0 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  0         0  
95 0 0 0     0 if (! contained_in($feature, keys %{$self->{_prob_distribution_numeric_features_hash}}) &&
  0         0  
96             $not_all_values_float == 0) {
97 0         0 $newvalue = closest_sampling_point($value, \@unique_values_for_feature);
98             }
99 0         0 push @new_features_and_values, "$feature" . '=' . "$newvalue";
100             }
101 0         0 @features_and_values = @new_features_and_values;
102 0 0       0 print "\nCL1 New feature and values: @features_and_values\n" if $self->{_debug3};
103 0         0 my %answer = ();
104 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
105 0         0 $answer{$class_name} = undef;
106             }
107 0         0 $answer{'solution_path'} = [];
108 0         0 my %classification = %{$self->recursive_descent_for_classification($root_node,
  0         0  
109             \@features_and_values,\%answer)};
110 0         0 @{$answer{'solution_path'}} = reverse @{$answer{'solution_path'}};
  0         0  
  0         0  
111 0 0       0 if ($self->{_debug3}) {
112 0         0 print "\nCL2 The classification:\n";
113 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
114 0         0 print " $class_name with probability $classification{$class_name}\n";
115             }
116             }
117 0         0 my %classification_for_display = ();
118 0         0 foreach my $item (keys %classification) {
119 0 0       0 if ($item ne 'solution_path') {
120 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
121             } else {
122 0         0 my @outlist = ();
123 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
124 0         0 push @outlist, "NODE$x";
125             }
126 0         0 $classification_for_display{$item} = \@outlist;
127             }
128             }
129 0         0 return \%classification_for_display;
130             }
131              
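A minimal sketch of a classify() call, assuming the tree has already been constructed (see construct_decision_tree_classifier() further down) and that $dt and $root_node are in scope; the feature names and values are made up, and every entry must be in the "feature=value" form parsed above:

    # Hypothetical test sample; every entry must look like "feature=value".
    my @test_sample = qw/ grade=2.3 age=55.0 ploidy=diploid /;
    my $classification = $dt->classify($root_node, \@test_sample);
    foreach my $class_name (sort keys %$classification) {
        next if $class_name eq 'solution_path';
        print "$class_name  =>  $classification->{$class_name}\n";
    }
    print "Solution path: @{$classification->{'solution_path'}}\n";
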
132             sub recursive_descent_for_classification {
133 0     0 0 0 my $self = shift;
134 0         0 my $node = shift;
135 0         0 my $features_and_values = shift;
136 0         0 my $answer = shift;
137 0         0 my @features_and_values = @$features_and_values;
138 0         0 my %answer = %$answer;
139 0         0 my @children = @{$node->get_children()};
  0         0  
140 0 0       0 if (@children == 0) {
141 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
142 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
143 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
144             }
145 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
146 0         0 return \%answer;
147             }
148 0         0 my $feature_tested_at_node = $node->get_feature();
149             print "\nCLRD1 Feature tested at node for classification: $feature_tested_at_node\n"
150 0 0       0 if $self->{_debug3};
151 0         0 my $value_for_feature;
152             my $path_found;
153 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
154 0         0 foreach my $feature_and_value (@features_and_values) {
155 0         0 $feature_and_value =~ /$pattern/;
156 0 0       0 $value_for_feature = $2 if $feature_tested_at_node eq $1;
157             }
158             # The following clause was introduced in Version 3.20
159 0 0       0 if (!defined $value_for_feature) {
160 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
161 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
162 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
163             }
164 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
165 0         0 return \%answer;
166             }
167 0 0       0 if ($value_for_feature) {
168 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
169 0 0       0 print( "\nCLRD2 In the truly numeric section") if $self->{_debug3};
170 0         0 my $pattern1 = '(.+)<(.+)';
171 0         0 my $pattern2 = '(.+)>(.+)';
172 0         0 foreach my $child (@children) {
173 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
174 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
175 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern1/) {
176 0         0 my ($feature, $threshold) = ($1,$2);
177 0 0       0 if ($value_for_feature <= $threshold) {
178 0         0 $path_found = 1;
179 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
180             $features_and_values,\%answer)};
181 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
182 0         0 last;
183             }
184             }
185 0 0       0 if ($last_feature_and_value_on_branch =~ /$pattern2/) {
186 0         0 my ($feature, $threshold) = ($1,$2);
187 0 0       0 if ($value_for_feature > $threshold) {
188 0         0 $path_found = 1;
189 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
190             $features_and_values,\%answer)};
191 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
192 0         0 last;
193             }
194             }
195             }
196 0 0       0 return \%answer if $path_found;
197             } else {
198 0         0 my $feature_value_combo = "$feature_tested_at_node" . '=' . "$value_for_feature";
199             print "\nCLRD3 In the symbolic section with feature_value_combo: $feature_value_combo\n"
200 0 0       0 if $self->{_debug3};
201 0         0 foreach my $child (@children) {
202 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
203 0 0       0 print "\nCLRD4 branch features and values: @branch_features_and_values\n" if $self->{_debug3};
204 0         0 my $last_feature_and_value_on_branch = $branch_features_and_values[-1];
205 0 0       0 if ($last_feature_and_value_on_branch eq $feature_value_combo) {
206 0         0 %answer = %{$self->recursive_descent_for_classification($child,
  0         0  
207             $features_and_values,\%answer)};
208 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
209 0         0 $path_found = 1;
210 0         0 last;
211             }
212             }
213 0 0       0 return \%answer if $path_found;
214             }
215             }
216 0 0       0 if (! $path_found) {
217 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
218 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
219 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
220             }
221 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
222             }
223 0         0 return \%answer;
224             }
225              
226             ## If you want classification to be carried out by engaging a human user in a
227             ## question-answer session, this is the method to use for that purpose. See, for
228             ## example, the script classify_by_asking_questions.pl in the `examples'
229             ## subdirectory for an illustration of how to do that.
230             sub classify_by_asking_questions {
231 0     0 1 0 my $self = shift;
232 0         0 my $root_node = shift;
233 0         0 my %answer = ();
234 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
235 0         0 $answer{$class_name} = undef;
236             }
237 0         0 $answer{'solution_path'} = [];
238 0         0 my %scratchpad_for_numeric_answers = ();
239 0         0 foreach my $feature_name (keys %{$self->{_prob_distribution_numeric_features_hash}}) {
  0         0  
240 0         0 $scratchpad_for_numeric_answers{$feature_name} = undef;
241             }
242 0         0 my %classification = %{$self->interactive_recursive_descent_for_classification($root_node,
  0         0  
243             \%answer, \%scratchpad_for_numeric_answers)};
244 0         0 @{$classification{'solution_path'}} = reverse @{$classification{'solution_path'}};
  0         0  
  0         0  
245 0         0 my %classification_for_display = ();
246 0         0 foreach my $item (keys %classification) {
247 0 0       0 if ($item ne 'solution_path') {
248 0         0 $classification_for_display{$item} = sprintf("%0.3f", $classification{$item});
249             } else {
250 0         0 my @outlist = ();
251 0         0 foreach my $x (@{$classification{$item}}) {
  0         0  
252 0         0 push @outlist, "NODE$x";
253             }
254 0         0 $classification_for_display{$item} = \@outlist;
255             }
256             }
257 0         0 return \%classification_for_display;
258             }
259              
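A minimal sketch of interactive classification, assuming $dt and $root_node were set up as in the earlier examples; the method prompts on the terminal for one feature value at a time:

    my $classification = $dt->classify_by_asking_questions($root_node);
    foreach my $class_name (sort keys %$classification) {
        next if $class_name eq 'solution_path';
        print "$class_name  =>  $classification->{$class_name}\n";
    }
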
260             sub interactive_recursive_descent_for_classification {
261 0     0 0 0 my $self = shift;
262 0         0 my $node = shift;
263 0         0 my $answer = shift;
264 0         0 my $scratchpad_for_numerics = shift;
265 0         0 my %answer = %$answer;
266 0         0 my %scratchpad_for_numerics = %$scratchpad_for_numerics;
267 0         0 my $pattern1 = '(.+)<(.+)';
268 0         0 my $pattern2 = '(.+)>(.+)';
269 0         0 my $user_value_for_feature;
270 0         0 my @children = @{$node->get_children()};
  0         0  
271 0 0       0 if (@children == 0) {
272 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
273 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
274 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
275             }
276 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
277 0         0 return \%answer;
278             }
279 0         0 my @list_of_branch_attributes_to_children = ();
280 0         0 foreach my $child (@children) {
281 0         0 my @branch_features_and_values = @{$child->get_branch_features_and_values_or_thresholds()};
  0         0  
282 0         0 my $feature_and_value_on_branch = $branch_features_and_values[-1];
283 0         0 push @list_of_branch_attributes_to_children, $feature_and_value_on_branch;
284             }
285 0         0 my $feature_tested_at_node = $node->get_feature();
286 0         0 my $feature_value_combo;
287 0         0 my $path_found = 0;
288 0 0       0 if (contained_in($feature_tested_at_node, keys %{$self->{_prob_distribution_numeric_features_hash}})) {
  0         0  
289 0 0       0 if ($scratchpad_for_numerics{$feature_tested_at_node}) {
290 0         0 $user_value_for_feature = $scratchpad_for_numerics{$feature_tested_at_node};
291             } else {
292 0         0 my @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_tested_at_node}};
  0         0  
293 0         0 while (1) {
294 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
295 0         0 print "\nEnter a value in the range (@valuerange): ";
296 0         0 $user_value_for_feature = <STDIN>;
297 0         0 $user_value_for_feature =~ s/\r?\n?$//;
298 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
299 0         0 my $answer_found = 0;
300 0 0 0     0 if ($user_value_for_feature >= $valuerange[0] && $user_value_for_feature <= $valuerange[1]) {
301 0         0 $answer_found = 1;
302 0         0 last;
303             }
304 0 0       0 last if $answer_found;
305 0         0 print("You entered an illegal value. Let's try again");
306             }
307 0         0 $scratchpad_for_numerics{$feature_tested_at_node} = $user_value_for_feature;
308             }
309 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
310 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
311 0 0       0 if ($branch_attribute =~ /$pattern1/) {
312 0         0 my ($feature,$threshold) = ($1,$2);
313 0 0       0 if ($user_value_for_feature <= $threshold) {
314 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
315             \%answer, \%scratchpad_for_numerics)};
316 0         0 $path_found = 1;
317 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
318 0         0 last;
319             }
320             }
321 0 0       0 if ($branch_attribute =~ /$pattern2/) {
322 0         0 my ($feature,$threshold) = ($1,$2);
323 0 0       0 if ($user_value_for_feature > $threshold) {
324 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
325             \%answer, \%scratchpad_for_numerics)};
326 0         0 $path_found = 1;
327 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
328 0         0 last;
329             }
330             }
331             }
332 0 0       0 return \%answer if $path_found;
333             } else {
334 0         0 my @possible_values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature_tested_at_node}};
  0         0  
335 0         0 while (1) {
336 0         0 print "\nWhat is the value for the feature $feature_tested_at_node ?\n";
337 0         0 print "\nEnter a value from the list (@possible_values_for_feature): ";
338 0         0 $user_value_for_feature = <STDIN>;
339 0         0 $user_value_for_feature =~ s/\r?\n?$//;
340 0         0 $user_value_for_feature =~ s/^\s*(\S+)\s*$/$1/;
341 0         0 my $answer_found = 0;
342 0 0       0 if (contained_in($user_value_for_feature, @possible_values_for_feature)) {
343 0         0 $answer_found = 1;
344 0         0 last;
345             }
346 0 0       0 last if $answer_found;
347 0         0 print("You entered an illegal value. Let's try again");
348             }
349 0         0 $feature_value_combo = "$feature_tested_at_node=$user_value_for_feature";
350 0         0 foreach my $i (0..@list_of_branch_attributes_to_children-1) {
351 0         0 my $branch_attribute = $list_of_branch_attributes_to_children[$i];
352 0 0       0 if ($branch_attribute eq $feature_value_combo) {
353 0         0 %answer = %{$self->interactive_recursive_descent_for_classification($children[$i],
  0         0  
354             \%answer, \%scratchpad_for_numerics)};
355 0         0 $path_found = 1;
356 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
357 0         0 last;
358             }
359             }
360 0 0       0 return \%answer if $path_found;
361             }
362 0 0       0 if (! $path_found) {
363 0         0 my @leaf_node_class_probabilities = @{$node->get_class_probabilities()};
  0         0  
364 0         0 foreach my $i (0..@{$self->{_class_names}}-1) {
  0         0  
365 0         0 $answer{$self->{_class_names}->[$i]} = $leaf_node_class_probabilities[$i];
366             }
367 0         0 push @{$answer{'solution_path'}}, $node->get_serial_num();
  0         0  
368             }
369 0         0 return \%answer;
370             }
371              
372             ###################################### Decision Tree Construction ####################################
373              
374             ## At the root node, we find the best feature that yields the greatest reduction in
375             ## class entropy from the entropy based on just the class priors. The logic for
376             ## finding this feature is different for symbolic features and for numeric features.
377             ## That logic is built into the method shown later for best feature calculations.
378             sub construct_decision_tree_classifier {
379 1     1 1 8 print "\nConstructing the decision tree ...\n";
380 1         2 my $self = shift;
381 1 50       3 if ($self->{_debug3}) {
382 0         0 $self->determine_data_condition();
383 0         0 print "\nStarting construction of the decision tree:\n";
384             }
385 1         2 my @class_probabilities = map {$self->prior_probability_for_class($_)} @{$self->{_class_names}};
  2         5  
  1         2  
386 1 50       4 if ($self->{_debug3}) {
387 0         0 print "\nPrior class probabilities: @class_probabilities\n";
388 0         0 print "\nClass names: @{$self->{_class_names}}\n";
  0         0  
389             }
390 1         4 my $entropy = $self->class_entropy_on_priors();
391 1 50       3 print "\nClass entropy on priors: $entropy\n" if $self->{_debug3};
392 1         11 my $root_node = DTNode->new(undef, $entropy, \@class_probabilities, [], $self, 'root');
393 1         1 $root_node->set_class_names(\@{$self->{_class_names}});
  1         4  
394 1         1 $self->{_root_node} = $root_node;
395 1         3 $self->recursive_descent($root_node);
396 1         5 return $root_node;
397             }
398              
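A sketch of the usual call sequence leading up to tree construction. The two probability methods appear later in this file; get_training_data() is assumed to be the method, defined elsewhere in this module, that reads and parses the training file:

    $dt->get_training_data();                         # assumed: reads and parses the training file
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
    my $root_node = $dt->construct_decision_tree_classifier();
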
399             ## After the root node of the decision tree is calculated by the previous methods,
400             ## we invoke this method recursively to create the rest of the tree. At each node,
401             ## we find the feature that achieves the largest entropy reduction with regard to
402             ## the partitioning of the training data samples that correspond to that node.
403             sub recursive_descent {
404 26     26 0 23 my $self = shift;
405 26         13 my $node = shift;
406             print "\n==================== ENTERING RECURSIVE DESCENT ==========================\n"
407 26 50       44 if $self->{_debug3};
408 26         31 my $node_serial_number = $node->get_serial_num();
409 26         18 my @features_and_values_or_thresholds_on_branch = @{$node->get_branch_features_and_values_or_thresholds()};
  26         29  
410 26         27 my $existing_node_entropy = $node->get_node_entropy();
411 26 50       38 if ($self->{_debug3}) {
412 0         0 print "\nRD1 NODE SERIAL NUMBER: $node_serial_number\n";
413 0         0 print "\nRD2 Existing Node Entropy: $existing_node_entropy\n";
414 0         0 print "\nRD3 features_and_values_or_thresholds_on_branch: @features_and_values_or_thresholds_on_branch\n";
415 0         0 my @class_probs = @{$node->get_class_probabilities()};
  0         0  
416 0         0 print "\nRD4 Class probabilities: @class_probs\n";
417             }
418 26 100       36 if ($existing_node_entropy < $self->{_entropy_threshold}) {
419 8 50       12 print "\nRD5 returning because existing node entropy is below threshold\n" if $self->{_debug3};
420 8         23 return;
421             }
422 18         16 my @copy_of_path_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  18         22  
423 18         35 my ($best_feature, $best_feature_entropy, $best_feature_val_entropies, $decision_val) =
424             $self->best_feature_calculator(\@copy_of_path_attributes, $existing_node_entropy);
425 18         30 $node->set_feature($best_feature);
426 18 50       27 $node->display_node() if $self->{_debug3};
427 18 50 33     28 if (defined($self->{_max_depth_desired}) &&
428             (@features_and_values_or_thresholds_on_branch >= $self->{_max_depth_desired})) {
429 0 0       0 print "\nRD6 REACHED LEAF NODE AT MAXIMUM DEPTH ALLOWED\n" if $self->{_debug3};
430 0         0 return;
431             }
432 18 50       27 return if ! defined $best_feature;
433 18 50       21 if ($self->{_debug3}) {
434 0         0 print "\nRD7 Existing entropy at node: $existing_node_entropy\n";
435 0         0 print "\nRD8 Calculated best feature is $best_feature and its value $decision_val\n";
436 0         0 print "\nRD9 Best feature entropy: $best_feature_entropy\n";
437 0         0 print "\nRD10 Calculated entropies for different values of best feature: @$best_feature_val_entropies\n";
438             }
439 18         19 my $entropy_gain = $existing_node_entropy - $best_feature_entropy;
440 18 50       22 print "\nRD11 Expected entropy gain at this node: $entropy_gain\n" if $self->{_debug3};
441 18 100       25 if ($entropy_gain > $self->{_entropy_threshold}) {
442 12 50 33     23 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature} &&
443             $self->{_feature_values_how_many_uniques_hash}->{$best_feature} >
444             $self->{_symbolic_to_numeric_cardinality_threshold}) {
445 0         0 my $best_threshold = $decision_val; # as returned by best feature calculator
446 0         0 my ($best_entropy_for_less, $best_entropy_for_greater) = @$best_feature_val_entropies;
447             my @extended_branch_features_and_values_or_thresholds_for_lessthan_child =
448 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
449             my @extended_branch_features_and_values_or_thresholds_for_greaterthan_child =
450 0         0 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
451 0         0 my $feature_threshold_combo_for_less_than = "$best_feature" . '<' . "$best_threshold";
452 0         0 my $feature_threshold_combo_for_greater_than = "$best_feature" . '>' . "$best_threshold";
453 0         0 push @extended_branch_features_and_values_or_thresholds_for_lessthan_child,
454             $feature_threshold_combo_for_less_than;
455 0         0 push @extended_branch_features_and_values_or_thresholds_for_greaterthan_child,
456             $feature_threshold_combo_for_greater_than;
457 0 0       0 if ($self->{_debug3}) {
458 0         0 print "\nRD12 extended_branch_features_and_values_or_thresholds_for_lessthan_child: " .
459             "@extended_branch_features_and_values_or_thresholds_for_lessthan_child\n";
460 0         0 print "\nRD13 extended_branch_features_and_values_or_thresholds_for_greaterthan_child: " .
461             "@extended_branch_features_and_values_or_thresholds_for_greaterthan_child\n";
462             }
463             my @class_probabilities_for_lessthan_child_node =
464 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
465 0         0 \@extended_branch_features_and_values_or_thresholds_for_lessthan_child)} @{$self->{_class_names}};
  0         0  
466             my @class_probabilities_for_greaterthan_child_node =
467 0         0 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
468 0         0 \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child)} @{$self->{_class_names}};
  0         0  
469 0 0       0 if ($self->{_debug3}) {
470 0         0 print "\nRD14 class entropy for going down lessthan child: $best_entropy_for_less\n";
471 0         0 print "\nRD15 class_entropy_for_going_down_greaterthan_child: $best_entropy_for_greater\n";
472             }
473 0 0       0 if ($best_entropy_for_less < $existing_node_entropy - $self->{_entropy_threshold}) {
474 0         0 my $left_child_node = DTNode->new(undef, $best_entropy_for_less,
475             \@class_probabilities_for_lessthan_child_node,
476             \@extended_branch_features_and_values_or_thresholds_for_lessthan_child, $self);
477 0         0 $node->add_child_link($left_child_node);
478 0         0 $self->recursive_descent($left_child_node);
479             }
480 0 0       0 if ($best_entropy_for_greater < $existing_node_entropy - $self->{_entropy_threshold}) {
481 0         0 my $right_child_node = DTNode->new(undef, $best_entropy_for_greater,
482             \@class_probabilities_for_greaterthan_child_node,
483             \@extended_branch_features_and_values_or_thresholds_for_greaterthan_child, $self);
484 0         0 $node->add_child_link($right_child_node);
485 0         0 $self->recursive_descent($right_child_node);
486             }
487             } else {
488             print "\nRD16 RECURSIVE DESCENT: In section for symbolic features for creating children"
489 12 50       15 if $self->{_debug3};
490 12         8 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$best_feature}};
  12         22  
491 12 50       19 print "\nRD17 Values for feature $best_feature are @values_for_feature\n" if $self->{_debug3};
492 12         29 my @feature_value_combos = sort map {"$best_feature" . '=' . $_} @values_for_feature;
  40         75  
493 12         13 my @class_entropies_for_children = ();
494 12         23 foreach my $feature_and_value_index (0..@feature_value_combos-1) {
495             print "\nRD18 Creating a child node for: $feature_value_combos[$feature_and_value_index]\n"
496 40 50       50 if $self->{_debug3};
497 40         32 my @extended_branch_features_and_values_or_thresholds;
498 40 100       45 if (! @features_and_values_or_thresholds_on_branch) {
499 4         7 @extended_branch_features_and_values_or_thresholds =
500             ($feature_value_combos[$feature_and_value_index]);
501             } else {
502             @extended_branch_features_and_values_or_thresholds =
503 36         27 @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  36         38  
504 36         41 push @extended_branch_features_and_values_or_thresholds,
505             $feature_value_combos[$feature_and_value_index];
506             }
507             my @class_probabilities =
508 80         89 map {$self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($_,
509 40         31 \@extended_branch_features_and_values_or_thresholds)} @{$self->{_class_names}};
  40         42  
510 40         48 my $class_entropy_for_child =
511             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(
512             \@extended_branch_features_and_values_or_thresholds);
513 40 50       56 if ($self->{_debug3}) {
514 0         0 print "\nRD19 branch attributes: @extended_branch_features_and_values_or_thresholds\n";
515 0         0 print "\nRD20 class entropy for child: $class_entropy_for_child\n";
516             }
517 40 100       51 if ($existing_node_entropy - $class_entropy_for_child > $self->{_entropy_threshold}) {
518 25         42 my $child_node = DTNode->new(undef, $class_entropy_for_child,
519             \@class_probabilities, \@extended_branch_features_and_values_or_thresholds, $self);
520 25         38 $node->add_child_link($child_node);
521 25         78 $self->recursive_descent($child_node);
522             } else {
523 15 50       34 print "\nRD21 This child will NOT result in a node\n" if $self->{_debug3};
524             }
525             }
526             }
527             } else {
528             print "\nRD22 REACHED LEAF NODE NATURALLY for: @features_and_values_or_thresholds_on_branch\n"
529 6 50       10 if $self->{_debug3};
530 6         22 return;
531             }
532             }
533              
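To make the stopping logic above concrete: a child node is created only when the drop from the node's existing entropy to the best feature's entropy exceeds the _entropy_threshold set in the constructor. A small worked example with made-up numbers:

    # Made-up numbers for illustration:
    my $existing_node_entropy = 0.97;    # entropy at the current node
    my $best_feature_entropy  = 0.62;    # weighted entropy after splitting on the best feature
    my $entropy_threshold     = 0.01;    # constructor default
    my $entropy_gain = $existing_node_entropy - $best_feature_entropy;    # 0.35
    print "keep splitting\n" if $entropy_gain > $entropy_threshold;       # 0.35 > 0.01, so recurse
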
534             ## This is the heart of the decision tree constructor. Its main job is to figure
535             ## out the best feature to use for partitioning the training data samples that
536             ## correspond to the current node. The search for the best feature is carried out
537             ## differently for symbolic features and for numeric features. For a symbolic
538             ## feature, the method estimates the entropy for each value of the feature and then
539             ## averages out these entropies as a measure of the discriminatory power of that
540             ## feature. For a numeric feature, on the other hand, it estimates the entropy
541             ## reduction that can be achieved if we were to partition the set of training samples
542             ## for each possible threshold. For a numeric feature, all possible sampling points
543             ## relevant to the node in question are considered as candidates for thresholds.
544             sub best_feature_calculator {
545 18     18 0 18 my $self = shift;
546 18         8 my $features_and_values_or_thresholds_on_branch = shift;
547 18         13 my $existing_node_entropy = shift;
548 18         19 my @features_and_values_or_thresholds_on_branch = @$features_and_values_or_thresholds_on_branch;
549 18         17 my $pattern1 = '(.+)=(.+)';
550 18         9 my $pattern2 = '(.+)<(.+)';
551 18         13 my $pattern3 = '(.+)>(.+)';
552 18         17 my @all_symbolic_features = ();
553 18         13 foreach my $feature_name (@{$self->{_feature_names}}) {
  18         25  
554             push @all_symbolic_features, $feature_name
555 72 50       111 if ! exists $self->{_prob_distribution_numeric_features_hash}->{$feature_name};
556             }
557 18         14 my @symbolic_features_already_used = ();
558 18         18 foreach my $feature_and_value_or_threshold (@features_and_values_or_thresholds_on_branch) {
559 47 50       197 push @symbolic_features_already_used, $1 if $feature_and_value_or_threshold =~ /$pattern1/;
560             }
561 18         14 my @symbolic_features_not_yet_used;
562 18         14 foreach my $x (@all_symbolic_features) {
563 72 100       65 push @symbolic_features_not_yet_used, $x unless contained_in($x, @symbolic_features_already_used);
564             }
565 18         15 my @true_numeric_types = ();
566 18         74 my @symbolic_types = ();
567 18         13 my @true_numeric_types_feature_names = ();
568 18         11 my @symbolic_types_feature_names = ();
569 18         17 foreach my $item (@features_and_values_or_thresholds_on_branch) {
570 47 50       146 if ($item =~ /$pattern2/) {
    50          
    50          
571 0         0 push @true_numeric_types, $item;
572 0         0 push @true_numeric_types_feature_names, $1;
573             } elsif ($item =~ /$pattern3/) {
574 0         0 push @true_numeric_types, $item;
575 0         0 push @true_numeric_types_feature_names, $1;
576             } elsif ($item =~ /$pattern1/) {
577 47         36 push @symbolic_types, $item;
578 47         66 push @symbolic_types_feature_names, $1;
579             } else {
580 0         0 die "format error in the representation of feature and values or thresholds";
581             }
582             }
583 18         24 my %seen = ();
584 18 0       15 @true_numeric_types_feature_names = grep {$_ if !$seen{$_}++} @true_numeric_types_feature_names;
  0         0  
585 18         27 %seen = ();
586 18 50       18 @symbolic_types_feature_names = grep {$_ if !$seen{$_}++} @symbolic_types_feature_names;
  47         104  
587             my @bounded_intervals_numeric_types =
588 18         11 @{$self->find_bounded_intervals_for_numeric_features(\@true_numeric_types)};
  18         33  
589             # Calculate the upper and the lower bounds to be used when searching for the best
590             # threshold for each of the numeric features that are in play at the current node:
591 18         13 my (%upperbound, %lowerbound);
592 18         17 foreach my $feature (@true_numeric_types_feature_names) {
593 0         0 $upperbound{$feature} = undef;
594 0         0 $lowerbound{$feature} = undef;
595             }
596 18         17 foreach my $item (@bounded_intervals_numeric_types) {
597 0         0 foreach my $feature_grouping (@$item) {
598 0 0       0 if ($feature_grouping->[1] eq '>') {
599 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
600             } else {
601 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
602             }
603             }
604             }
605 18         15 my %entropy_values_for_different_features = ();
606 18         11 my %partitioning_point_child_entropies_hash = ();
607 18         15 my %partitioning_point_threshold = ();
608 18         16 my %entropies_for_different_values_of_symbolic_feature = ();
609 18         18 foreach my $feature (@{$self->{_feature_names}}) {
  18         25  
610 72         73 $entropy_values_for_different_features{$feature} = [];
611 72         59 $partitioning_point_child_entropies_hash{$feature} = {};
612 72         54 $partitioning_point_threshold{$feature} = undef;
613 72         74 $entropies_for_different_values_of_symbolic_feature{$feature} = [];
614             }
615 18         13 foreach my $i (0..@{$self->{_feature_names}}-1) {
  18         51  
616 72         62 my $feature_name = $self->{_feature_names}->[$i];
617 72 50       82 print "\n\nBFC1 FEATURE BEING CONSIDERED: $feature_name\n" if $self->{_debug3};
618 72 100 33     82 if (contained_in($feature_name, @symbolic_features_already_used)) {
    50          
619 47         45 next;
620 25         44 } elsif (contained_in($feature_name, keys %{$self->{_numeric_features_valuerange_hash}}) &&
621             $self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
622             $self->{_symbolic_to_numeric_cardinality_threshold}) {
623 0         0 my @values = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
624 0 0       0 print "\nBFC4 values for $feature_name are @values\n" if $self->{_debug3};
625 0         0 my @newvalues = ();
626 0 0       0 if (contained_in($feature_name, @true_numeric_types_feature_names)) {
627 0 0 0     0 if (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
628             $lowerbound{$feature_name} >= $upperbound{$feature_name}) {
629 0         0 next;
630             } elsif (defined($upperbound{$feature_name}) && defined($lowerbound{$feature_name}) &&
631             $lowerbound{$feature_name} < $upperbound{$feature_name}) {
632 0         0 foreach my $x (@values) {
633 0 0 0     0 push @newvalues, $x if $x > $lowerbound{$feature_name} && $x <= $upperbound{$feature_name};
634             }
635             } elsif (defined($upperbound{$feature_name})) {
636 0         0 foreach my $x (@values) {
637 0 0       0 push @newvalues, $x if $x <= $upperbound{$feature_name};
638             }
639             } elsif (defined($lowerbound{$feature_name})) {
640 0         0 foreach my $x (@values) {
641 0 0       0 push @newvalues, $x if $x > $lowerbound{$feature_name};
642             }
643             } else {
644 0         0 die "Error in bound specifications in best feature calculator";
645             }
646             } else {
647 0         0 @newvalues = @{deep_copy_array(\@values)};
  0         0  
648             }
649 0 0       0 next if @newvalues == 0;
650 0         0 my @partitioning_entropies = ();
651 0         0 foreach my $value (@newvalues) {
652 0         0 my $feature_and_less_than_value_string = "$feature_name" . '<' . "$value";
653 0         0 my $feature_and_greater_than_value_string = "$feature_name" . '>' . "$value";
654 0         0 my @for_left_child;
655             my @for_right_child;
656 0 0       0 if (@features_and_values_or_thresholds_on_branch) {
657 0         0 @for_left_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
658 0         0 push @for_left_child, $feature_and_less_than_value_string;
659 0         0 @for_right_child = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  0         0  
660 0         0 push @for_right_child, $feature_and_greater_than_value_string;
661             } else {
662 0         0 @for_left_child = ($feature_and_less_than_value_string);
663 0         0 @for_right_child = ($feature_and_greater_than_value_string);
664             }
665 0         0 my $entropy1 = $self->class_entropy_for_less_than_threshold_for_feature(
666             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
667 0         0 my $entropy2 = $self->class_entropy_for_greater_than_threshold_for_feature(
668             \@features_and_values_or_thresholds_on_branch, $feature_name, $value);
669 0         0 my $partitioning_entropy = $entropy1 *
670             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_left_child) +
671             $entropy2 *
672             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@for_right_child);
673              
674 0         0 push @partitioning_entropies, $partitioning_entropy;
675 0         0 $partitioning_point_child_entropies_hash{$feature_name}{$value} = [$entropy1, $entropy2];
676             }
677 0         0 my ($min_entropy, $best_partition_point_index) = minimum(\@partitioning_entropies);
678 0 0       0 if ($min_entropy < $existing_node_entropy) {
679 0         0 $partitioning_point_threshold{$feature_name} = $newvalues[$best_partition_point_index];
680 0         0 $entropy_values_for_different_features{$feature_name} = $min_entropy;
681             }
682             } else {
683 25 50       29 print "\nBFC2: Entering section reserved for symbolic features\n" if $self->{_debug3};
684 25 50       30 print "\nBFC3 Feature name: $feature_name\n" if $self->{_debug3};
685 25         20 my %seen;
686 82 50       262 my @values = grep {$_ ne 'NA' && !$seen{$_}++}
687 25         16 @{$self->{_features_and_unique_values_hash}->{$feature_name}};
  25         35  
688 25         47 @values = sort @values;
689 25 50       31 print "\nBFC4 values for feature $feature_name are @values\n" if $self->{_debug3};
690              
691 25         19 my $entropy = 0;
692 25         25 foreach my $value (@values) {
693 82         127 my $feature_value_string = "$feature_name" . '=' . "$value";
694 82 50       98 print "\nBFC4 feature_value_string: $feature_value_string\n" if $self->{_debug3};
695 82         59 my @extended_attributes = @{deep_copy_array(\@features_and_values_or_thresholds_on_branch)};
  82         96  
696 82 100       117 if (@features_and_values_or_thresholds_on_branch) {
697 68         63 push @extended_attributes, $feature_value_string;
698             } else {
699 14         14 @extended_attributes = ($feature_value_string);
700             }
701 82         100 $entropy +=
702             $self->class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds(\@extended_attributes) *
703             $self->probability_of_a_sequence_of_features_and_values_or_thresholds(\@extended_attributes);
704             print "\nBFC5 Entropy calculated for symbolic feature value choice ($feature_name,$value) " .
705 82 50       118 "is $entropy\n" if $self->{_debug3};
706 82         62 push @{$entropies_for_different_values_of_symbolic_feature{$feature_name}}, $entropy;
  82         157  
707             }
708 25 50       41 if ($entropy < $existing_node_entropy) {
709 25         64 $entropy_values_for_different_features{$feature_name} = $entropy;
710             }
711             }
712             }
713 18         16 my $min_entropy_for_best_feature;
714             my $best_feature_name;
715 18         30 foreach my $feature_nom (keys %entropy_values_for_different_features) {
716 72 100       73 if (!defined($best_feature_name)) {
717 18         14 $best_feature_name = $feature_nom;
718 18         20 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
719             } else {
720 54 100       86 if ($entropy_values_for_different_features{$feature_nom} < $min_entropy_for_best_feature) {
721 13         11 $best_feature_name = $feature_nom;
722 13         13 $min_entropy_for_best_feature = $entropy_values_for_different_features{$feature_nom};
723             }
724             }
725             }
726 18         15 my $threshold_for_best_feature;
727 18 50       23 if (exists $partitioning_point_threshold{$best_feature_name}) {
728 18         16 $threshold_for_best_feature = $partitioning_point_threshold{$best_feature_name};
729             } else {
730 0         0 $threshold_for_best_feature = undef;
731             }
732 18         10 my $best_feature_entropy = $min_entropy_for_best_feature;
733 18         15 my @val_based_entropies_to_be_returned;
734             my $decision_val_to_be_returned;
735 18 50 33     36 if (exists $self->{_numeric_features_valuerange_hash}->{$best_feature_name} &&
736             $self->{_feature_values_how_many_uniques_hash}->{$best_feature_name} >
737             $self->{_symbolic_to_numeric_cardinality_threshold}) {
738             @val_based_entropies_to_be_returned =
739 0         0 @{$partitioning_point_child_entropies_hash{$best_feature_name}{$threshold_for_best_feature}};
  0         0  
740             } else {
741 18         15 @val_based_entropies_to_be_returned = ();
742             }
743 18 50       26 if (exists $partitioning_point_threshold{$best_feature_name}) {
744 18         15 $decision_val_to_be_returned = $partitioning_point_threshold{$best_feature_name};
745             } else {
746 0         0 $decision_val_to_be_returned = undef;
747             }
748             print "\nBFC6 Val based entropies to be returned for feature $best_feature_name are " .
749 18 50       25 "@val_based_entropies_to_be_returned\n" if $self->{_debug3};
750 18         102 return ($best_feature_name, $best_feature_entropy, \@val_based_entropies_to_be_returned,
751             $decision_val_to_be_returned);
752             }
753              
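For a symbolic feature, the loop above accumulates a weighted entropy: the class entropy of the branch extended with each "feature=value" combination, weighted by the probability of that extended branch. A worked example with made-up numbers for a two-valued feature:

    # P(branch extended with f=a) = 0.6, class entropy on that branch = 0.4 bits
    # P(branch extended with f=b) = 0.4, class entropy on that branch = 0.9 bits
    my $weighted_entropy = 0.6 * 0.4  +  0.4 * 0.9;    # 0.60 bits
    # The feature with the smallest such weighted entropy wins the comparison loop above.
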
754             ######################################### Entropy Calculators #####################################
755              
756             sub class_entropy_on_priors {
757 1     1 0 2 my $self = shift;
758             return $self->{_entropy_cache}->{'priors'}
759 1 50       4 if exists $self->{_entropy_cache}->{"priors"};
760 1         1 my @class_names = @{$self->{_class_names}};
  1         2  
761 1         2 my $entropy;
762 1         1 foreach my $class (@class_names) {
763 2         4 my $prob = $self->prior_probability_for_class($class);
764 2 50 33     18 my $log_prob = log($prob) / log(2) if ($prob >= 0.0001) && ($prob <= 0.999) ;
765 2 50       3 $log_prob = 0 if $prob < 0.0001; # since X.log(X)->0 as X->0
766 2 50       4 $log_prob = 0 if $prob > 0.999; # since log(1) = 0
767 2 100       5 if (!defined $entropy) {
768 1         2 $entropy = -1.0 * $prob * $log_prob;
769 1         2 next;
770             }
771 1         3 $entropy += -1.0 * $prob * $log_prob;
772             }
773 1         2 $self->{_entropy_cache}->{'priors'} = $entropy;
774 1         2 return $entropy;
775             }
776              
777             sub entropy_scanner_for_a_numeric_feature {
778 0     0 0 0 local $| = 1;
779 0         0 my $self = shift;
780 0         0 my $feature = shift;
781 0         0 my @all_sampling_points = @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature}};
  0         0  
782 0         0 my @entropies_for_less_than_thresholds = ();
783 0         0 my @entropies_for_greater_than_thresholds = ();
784 0         0 foreach my $point (@all_sampling_points) {
785 0         0 print ". ";
786 0         0 push @entropies_for_less_than_thresholds,
787             $self->class_entropy_for_less_than_threshold_for_feature([], $feature, $point);
788 0         0 push @entropies_for_greater_than_thresholds,
789             $self->class_entropy_for_greater_than_threshold_for_feature([], $feature, $point);
790             }
791 0         0 print "\n\nSCANNER: All entropies less than thresholds for feature $feature are: ".
792             "@entropies_for_less_than_thresholds\n";
793 0         0 print "\nSCANNER: All entropies greater than thresholds for feature $feature are: ".
794             "@entropies_for_greater_than_thresholds\n";
795             }
796              
797             sub class_entropy_for_less_than_threshold_for_feature {
798 0     0 0 0 my $self = shift;
799 0         0 my $arr = shift;
800 0         0 my $feature = shift;
801 0         0 my $threshold = shift;
802 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
803 0         0 my $feature_threshold_combo = "$feature" . '<' . "$threshold";
804 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
805 0         0 $sequence .= ":" . $feature_threshold_combo;
806 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
807             my @copy_of_array_of_features_and_values_or_thresholds =
808 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
809 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
810 0         0 my $entropy = 0;
811 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
812 0         0 my $log_prob = undef;
813 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
814             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
815 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
816 0         0 $log_prob = log($prob) / log(2.0);
817             } elsif ($prob < 0.0001) {
818 0         0 $log_prob = 0;
819             } elsif ($prob > 0.999) {
820 0         0 $log_prob = 0;
821             } else {
822 0         0 die "An error has occurred in log_prob calculation";
823             }
824 0         0 $entropy += -1.0 * $prob * $log_prob;
825             }
826 0 0       0 if (abs($entropy) < 0.0000001) {
827 0         0 $entropy = 0.0;
828             }
829 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
830 0         0 return $entropy;
831             }
832              
833             sub class_entropy_for_greater_than_threshold_for_feature {
834 0     0 0 0 my $self = shift;
835 0         0 my $arr = shift;
836 0         0 my $feature = shift;
837 0         0 my $threshold = shift;
838 0         0 my @array_of_features_and_values_or_thresholds = @$arr;
839 0         0 my $feature_threshold_combo = "$feature" . '>' . "$threshold";
840 0         0 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
841 0         0 $sequence .= ":" . $feature_threshold_combo;
842 0 0       0 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
843             my @copy_of_array_of_features_and_values_or_thresholds =
844 0         0 @{deep_copy_array(\@array_of_features_and_values_or_thresholds)};
  0         0  
845 0         0 push @copy_of_array_of_features_and_values_or_thresholds, $feature_threshold_combo;
846 0         0 my $entropy = 0;
847 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
848 0         0 my $log_prob = undef;
849 0         0 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
850             $class_name, \@copy_of_array_of_features_and_values_or_thresholds);
851 0 0 0     0 if ($prob >= 0.0001 && $prob <= 0.999) {
    0          
    0          
852 0         0 $log_prob = log($prob) / log(2.0);
853             } elsif ($prob < 0.0001) {
854 0         0 $log_prob = 0;
855             } elsif ($prob > 0.999) {
856 0         0 $log_prob = 0;
857             } else {
858 0         0 die "An error has occurred in log_prob calculation";
859             }
860 0         0 $entropy += -1.0 * $prob * $log_prob;
861             }
862 0 0       0 if (abs($entropy) < 0.0000001) {
863 0         0 $entropy = 0.0;
864             }
865 0         0 $self->{_entropy_cache}->{$sequence} = $entropy;
866 0         0 return $entropy;
867             }
868              
869             sub class_entropy_for_a_given_sequence_of_features_and_values_or_thresholds {
870 122     122 0 89 my $self = shift;
871 122         69 my $array_of_features_and_values_or_thresholds = shift;
872 122         124 my @array_of_features_and_values_or_thresholds = @$array_of_features_and_values_or_thresholds;
873 122         122 my $sequence = join ":", @array_of_features_and_values_or_thresholds;
874 122 100       229 return $self->{_entropy_cache}->{$sequence} if exists $self->{_entropy_cache}->{$sequence};
875 82         49 my $entropy = 0;
876 82         49 foreach my $class_name (@{$self->{_class_names}}) {
  82         89  
877 164         124 my $log_prob = undef;
878 164         192 my $prob = $self->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds(
879             $class_name, \@array_of_features_and_values_or_thresholds);
880 164 100 100     438 if ($prob >= 0.0001 && $prob <= 0.999) {
    100          
    50          
881 138         149 $log_prob = log($prob) / log(2.0);
882             } elsif ($prob < 0.0001) {
883 13         10 $log_prob = 0;
884             } elsif ($prob > 0.999) {
885 13         12 $log_prob = 0;
886             } else {
887 0         0 die "An error has occurred in log_prob calculation";
888             }
889 164         184 $entropy += -1.0 * $prob * $log_prob;
890             }
891 82 100       116 if (abs($entropy) < 0.0000001) {
892 13         11 $entropy = 0.0;
893             }
894 82         109 $self->{_entropy_cache}->{$sequence} = $entropy;
895 82         132 return $entropy;
896             }
897              
898              
899             ##################################### Probability Calculators ########################################
900              
901             sub prior_probability_for_class {
902 4     4 0 5 my $self = shift;
903 4         3 my $class = shift;
904 4         3 my $class_name_in_cache = "prior" . '::' . $class;
905             return $self->{_probability_cache}->{$class_name_in_cache}
906 4 50       16 if exists $self->{_probability_cache}->{$class_name_in_cache};
907 0         0 my $total_num_of_samples = keys %{$self->{_samples_class_label_hash}};
  0         0  
908 0         0 my @values = values %{$self->{_samples_class_label_hash}};
  0         0  
909 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
910 0         0 my @trues = grep {$_ eq $class_name} @values;
  0         0  
911 0         0 my $prior_for_this_class = (1.0 * @trues) / $total_num_of_samples;
912 0         0 my $this_class_name_in_cache = "prior" . '::' . $class_name;
913 0         0 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
914             }
915 0         0 return $self->{_probability_cache}->{$class_name_in_cache};
916             }
917              
918             sub calculate_class_priors {
919 1     1 1 7 my $self = shift;
920 1 50       2 return if scalar keys %{$self->{_class_priors_hash}} > 1;
  1         4  
921 1         2 foreach my $class_name (@{$self->{_class_names}}) {
  1         2  
922 2         4 my $class_name_in_cache = "prior::$class_name";
923 2         1 my $total_num_of_samples = scalar keys %{$self->{_samples_class_label_hash}};
  2         5  
924 2         1 my @all_values = values %{$self->{_samples_class_label_hash}};
  2         11  
925 2         3 my @trues_for_this_class = grep {$_ eq $class_name} @all_values;
  70         53  
926 2         5 my $prior_for_this_class = (1.0 * (scalar @trues_for_this_class)) / $total_num_of_samples;
927 2         3 $self->{_class_priors_hash}->{$class_name} = $prior_for_this_class;
928 2         4 my $this_class_name_in_cache = "prior::$class_name";
929 2         6 $self->{_probability_cache}->{$this_class_name_in_cache} = $prior_for_this_class;
930             }
931 1 50       4 if ($self->{_debug1}) {
932 0         0 foreach my $class (sort keys %{$self->{_class_priors_hash}}) {
  0         0  
933 0         0 print "$class => $self->{_class_priors_hash}->{$class}\n";
934             }
935             }
936             }
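## The prior computed above is simply the fraction of the training samples that carry
## a given class label.  Purely as an illustration (the numbers are hypothetical): if
## 14 of 35 training samples carry the label "class=benign", then
##
##     $self->{_class_priors_hash}->{'class=benign'}  =  14 / 35  =  0.4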
937              
938             sub calculate_first_order_probabilities {
939 1     1 1 13 print "\nEstimating probabilities...\n";
940 1         2 my $self = shift;
941 1         1 foreach my $feature (@{$self->{_feature_names}}) {
  1         3  
942 4         13 $self->probability_of_feature_value($feature, undef);
943 4 50       16 if ($self->{_debug2}) {
944 0 0       0 if (exists $self->{_prob_distribution_numeric_features_hash}->{$feature}) {
945 0         0 print "\nPresenting probability distribution for a numeric feature:\n";
946 0         0 foreach my $sampling_point (sort {$a <=> $b} keys
  0         0  
947 0         0 %{$self->{_prob_distribution_numeric_features_hash}->{$feature}}) {
948 0         0 my $sampling_pt_for_display = sprintf("%.2f", $sampling_point);
949             print "$feature :: $sampling_pt_for_display=" . sprintf("%.5f",
950 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature}{$sampling_point}) . "\n";
951             }
952             } else {
953 0         0 print "\nPresenting probabilities for the values of a feature considered to be symbolic:\n";
954 0         0 my @values_for_feature = @{$self->{_features_and_unique_values_hash}->{$feature}};
  0         0  
955 0         0 foreach my $value (sort @values_for_feature) {
956 0         0 my $prob = $self->probability_of_feature_value($feature,$value);
957 0         0 print "$feature :: $value = " . sprintf("%.5f", $prob) . "\n";
958             }
959             }
960             }
961             }
962             }
963              
964             sub probability_of_feature_value {
965 190     190 0 133 my $self = shift;
966 190         142 my $feature_name = shift;
967 190         131 my $value = shift;
968 190 50 66     568 $value = sprintf("%.1f", $value) if defined($value) && $value =~ /^\d+$/;
969 190 50 66     373 if (defined($value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
970             $value = closest_sampling_point($value,
971 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
972             }
973 190         112 my $feature_and_value;
974 190 100       228 if (defined($value)) {
975 186         196 $feature_and_value = "$feature_name=$value";
976             }
977 190 50 66     358 if (defined($value) && exists($self->{_probability_cache}->{$feature_and_value})) {
978 186         329 return $self->{_probability_cache}->{$feature_and_value};
979             }
980 4         7 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
981 4 50       10 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
982 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
983             $self->{_symbolic_to_numeric_cardinality_threshold}) {
984 0 0       0 if (! exists $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}) {
985 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
986 0         0 $diffrange = $valuerange[1] - $valuerange[0];
987 0         0 my %seen = ();
988 0 0 0     0 my @unique_values_for_feature = sort {$a <=> $b} grep {$_ if $_ ne 'NA' && !$seen{$_}++}
  0         0  
989 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
990 0         0 my @diffs = sort {$a <=> $b} map {$unique_values_for_feature[$_] -
  0         0  
  0         0  
991             $unique_values_for_feature[$_-1]} 1..@unique_values_for_feature-1;
992 0         0 my $median_diff = $diffs[int(@diffs/2) - 1];
993 0         0 $histogram_delta = $median_diff * 2;
994 0 0       0 if ($histogram_delta < $diffrange / 500.0) {
995 0 0       0 if (defined $self->{_number_of_histogram_bins}) {
996 0         0 $histogram_delta = $diffrange / $self->{_number_of_histogram_bins};
997             } else {
998 0         0 $histogram_delta = $diffrange / 500.0;
999             }
1000             }
1001 0         0 $self->{_histogram_delta_hash}->{$feature_name} = $histogram_delta;
1002 0         0 $num_of_histogram_bins = int($diffrange / $histogram_delta) + 1;
1003 0         0 $self->{_num_of_histogram_bins_hash}->{$feature_name} = $num_of_histogram_bins;
1004 0         0 my @sampling_points_for_feature = map {$valuerange[0] + $histogram_delta * $_}
  0         0  
1005             0..$num_of_histogram_bins-1;
1006 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name} =
1007             \@sampling_points_for_feature;
1008             }
1009             }
1010             }
1011 4 50       9 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1012 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1013             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1014             my @sampling_points_for_feature =
1015 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1016 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1017 0         0 my @actual_values_for_feature = grep {$_ ne 'NA'}
1018 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1019 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1020 0         0 foreach my $j (0..@actual_values_for_feature-1) {
1021 0 0       0 if (abs($sampling_points_for_feature[$i]-$actual_values_for_feature[$j]) < $histogram_delta) {
1022 0         0 $counts_at_sampling_points[$i]++
1023             }
1024             }
1025             }
1026 0         0 my $total_counts = 0;
1027 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1028 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1029 0         0 my %bin_prob_hash = ();
1030 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1031 0         0 $bin_prob_hash{$sampling_points_for_feature[$i]} = $probs[$i];
1032             }
1033 0         0 $self->{_prob_distribution_numeric_features_hash}->{$feature_name} = \%bin_prob_hash;
1034 0         0 my @values_for_feature = map "$feature_name=$_", map {sprintf("%.5f", $_)}
  0         0  
1035             @sampling_points_for_feature;
1036 0         0 foreach my $i (0..@values_for_feature-1) {
1037 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1038             }
1039 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1040 0         0 return $self->{_probability_cache}->{$feature_and_value};
1041             } else {
1042 0         0 return 0;
1043             }
1044             } else {
1045 0         0 my %seen = ();
1046 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1047 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1048 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1049 0         0 my @value_counts = (0) x @values_for_feature;
1050             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)}keys %{$self->{_training_data_hash}}){
1051 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)}keys %{$self->{_training_data_hash}}){
  0         0  
  0         0  
1052 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1053 0         0 foreach my $i (0..@values_for_feature-1) {
1054 0         0 foreach my $current_value (@features_and_values) {
1055 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1056             }
1057             }
1058             }
1059 0         0 my $total_counts = 0;
1060 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1061 0 0       0 die "PFV Something is wrong with your training file. It contains no training samples \
1062             for feature named $feature_name" if $total_counts == 0;
1063 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @value_counts;
  0         0  
1064 0         0 foreach my $i (0..@values_for_feature-1) {
1065 0         0 $self->{_probability_cache}->{$values_for_feature[$i]} = $probs[$i];
1066             }
1067 0 0 0     0 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1068 0         0 return $self->{_probability_cache}->{$feature_and_value};
1069             } else {
1070 0         0 return 0;
1071             }
1072             }
1073             } else {
1074             # This section is only for purely symbolic features:
1075 4         6 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  4         26  
1076 4         7 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  140         157  
1077 4         17 my @value_counts = (0) x @values_for_feature;
1078             # foreach my $sample (sort {sample_index($a) cmp sample_index($b)} keys %{$self->{_training_data_hash}}) {
1079 4         5 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %{$self->{_training_data_hash}}) {
  548         520  
  4         50  
1080 140         77 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         232  
1081 140         148 foreach my $i (0..@values_for_feature-1) {
1082 4900         3357 for my $current_value (@features_and_values) {
1083 19600 100       23181 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1084             }
1085             }
1086             }
1087 4         22 foreach my $i (0..@values_for_feature-1) {
1088             $self->{_probability_cache}->{$values_for_feature[$i]} =
1089 140         95 $value_counts[$i] / (1.0 * scalar(keys %{$self->{_training_data_hash}}));
  140         214  
1090             }
1091 4 50 33     16 if (defined($value) && exists $self->{_probability_cache}->{$feature_and_value}) {
1092 0         0 return $self->{_probability_cache}->{$feature_and_value};
1093             } else {
1094 4         20 return 0;
1095             }
1096             }
1097             }
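## For a numeric feature whose number of unique values exceeds the
## symbolic_to_numeric_cardinality_threshold, the code above lays out the histogram
## as follows: the bin width (histogram_delta) is twice the median gap between
## consecutive sorted unique values; if that comes out smaller than diffrange/500 it
## is replaced by diffrange/number_of_histogram_bins when that option was supplied,
## or by diffrange/500 otherwise.  The number of bins is int(diffrange/delta) + 1,
## and the sampling points start at the low end of the value range and advance in
## steps of histogram_delta.  An illustrative, hypothetical case:
##
##     unique values 1, 2, 3, ..., 50   =>  value range [1, 50], diffrange = 49
##     median gap = 1                   =>  histogram_delta = 2
##     number of bins = int(49/2) + 1 = 25
##     sampling points = 1, 3, 5, ..., 49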
1098              
1099             sub probability_of_feature_value_given_class {
1100 380     380 0 266 my $self = shift;
1101 380         261 my $feature_name = shift;
1102 380         266 my $feature_value = shift;
1103 380         225 my $class_name = shift;
1104 380 50 33     1091 $feature_value = sprintf("%.1f", $feature_value) if defined($feature_value) && $feature_value =~ /^\d+$/;
1105 380 50 33     678 if (defined($feature_value) && exists($self->{_sampling_points_for_numeric_feature_hash}->{$feature_name})) {
1106             $feature_value = closest_sampling_point($feature_value,
1107 0         0 $self->{_sampling_points_for_numeric_feature_hash}->{$feature_name});
1108             }
1109 380         239 my $feature_value_class;
1110 380 50       417 if (defined($feature_value)) {
1111 380         515 $feature_value_class = "$feature_name=$feature_value" . "::" . "$class_name";
1112             }
1113 380 100 33     759 if (defined($feature_value) && exists($self->{_probability_cache}->{$feature_value_class})) {
1114             print "\nNext answer returned by cache for feature $feature_name and " .
1115 372 50       443 "value $feature_value given class $class_name\n" if $self->{_debug2};
1116 372         730 return $self->{_probability_cache}->{$feature_value_class};
1117             }
1118 8         14 my ($histogram_delta, $num_of_histogram_bins, @valuerange, $diffrange) = (undef,undef,undef,undef);
1119              
1120 8 50       10 if (exists $self->{_numeric_features_valuerange_hash}->{$feature_name}) {
1121 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1122             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1123 0         0 $histogram_delta = $self->{_histogram_delta_hash}->{$feature_name};
1124 0         0 $num_of_histogram_bins = $self->{_num_of_histogram_bins_hash}->{$feature_name};
1125 0         0 @valuerange = @{$self->{_numeric_features_valuerange_hash}->{$feature_name}};
  0         0  
1126 0         0 $diffrange = $valuerange[1] - $valuerange[0];
1127             }
1128             }
1129 8         8 my @samples_for_class = ();
1130             # Accumulate all sample names for the given class:
1131 8         4 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  8         47  
1132 280 100       330 if ($self->{_samples_class_label_hash}->{$sample_name} eq $class_name) {
1133 140         111 push @samples_for_class, $sample_name;
1134             }
1135             }
1136 8 50       18 if (exists($self->{_numeric_features_valuerange_hash}->{$feature_name})) {
1137 0 0       0 if ($self->{_feature_values_how_many_uniques_hash}->{$feature_name} >
1138             $self->{_symbolic_to_numeric_cardinality_threshold}) {
1139             my @sampling_points_for_feature =
1140 0         0 @{$self->{_sampling_points_for_numeric_feature_hash}->{$feature_name}};
  0         0  
1141 0         0 my @counts_at_sampling_points = (0) x @sampling_points_for_feature;
1142 0         0 my @actual_feature_values_for_samples_in_class = ();
1143 0         0 foreach my $sample (@samples_for_class) {
1144 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1145 0         0 my $pattern = '(.+)=(.+)';
1146 0         0 $feature_and_value =~ /$pattern/;
1147 0         0 my ($feature, $value) = ($1, $2);
1148 0 0 0     0 if (($feature eq $feature_name) && ($value ne 'NA')) {
1149 0         0 push @actual_feature_values_for_samples_in_class, $value;
1150             }
1151             }
1152             }
1153 0         0 foreach my $i (0..@sampling_points_for_feature-1) {
1154 0         0 foreach my $j (0..@actual_feature_values_for_samples_in_class-1) {
1155 0 0       0 if (abs($sampling_points_for_feature[$i] -
1156             $actual_feature_values_for_samples_in_class[$j]) < $histogram_delta) {
1157 0         0 $counts_at_sampling_points[$i]++;
1158             }
1159             }
1160             }
1161 0         0 my $total_counts = 0;
1162 0         0 map {$total_counts += $_} @counts_at_sampling_points;
  0         0  
1163 0 0       0 die "PFVC1 Something is wrong with your training file. It contains no training " .
1164             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1165 0         0 my @probs = map {$_ / (1.0 * $total_counts)} @counts_at_sampling_points;
  0         0  
1166 0         0 my @values_for_feature_and_class = map {"$feature_name=$_" . "::" . "$class_name"}
  0         0  
1167             @sampling_points_for_feature;
1168 0         0 foreach my $i (0..@values_for_feature_and_class-1) {
1169 0         0 $self->{_probability_cache}->{$values_for_feature_and_class[$i]} = $probs[$i];
1170             }
1171 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1172 0         0 return $self->{_probability_cache}->{$feature_value_class};
1173             } else {
1174 0         0 return 0;
1175             }
1176             } else {
1177             # This section is for numeric features that will be treated symbolically
1178 0         0 my %seen = ();
1179 0 0 0     0 my @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1180 0         0 @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
1181 0         0 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  0         0  
1182 0         0 my @value_counts = (0) x @values_for_feature;
1183 0         0 foreach my $sample (@samples_for_class) {
1184 0         0 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  0         0  
1185 0         0 foreach my $i (0..@values_for_feature-1) {
1186 0         0 foreach my $current_value (@features_and_values) {
1187 0 0       0 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1188             }
1189             }
1190             }
1191 0         0 my $total_counts = 0;
1192 0         0 map {$total_counts += $_} @value_counts;
  0         0  
1193 0 0       0 die "PFVC2 Something is wrong with your training file. It contains no training " .
1194             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1195             # We normalize by total_count because the probabilities are conditioned on a given class
1196 0         0 foreach my $i (0..@values_for_feature-1) {
1197 0         0 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1198 0         0 $self->{_probability_cache}->{$feature_and_value_and_class} =
1199             $value_counts[$i] / (1.0 * $total_counts);
1200             }
1201 0 0       0 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1202 0         0 return $self->{_probability_cache}->{$feature_value_class};
1203             } else {
1204 0         0 return 0;
1205             }
1206             }
1207             } else {
1208             # This section is for purely symbolic features
1209 8         6 my @values_for_feature = @{$self->{_features_and_values_hash}->{$feature_name}};
  8         38  
1210 8         8 my %seen = ();
1211 280 100 66     737 @values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
1212 8         6 @{$self->{_features_and_values_hash}->{$feature_name}};
  8         11  
1213 8         8 @values_for_feature = map {"$feature_name=$_"} @values_for_feature;
  28         37  
1214 8         11 my @value_counts = (0) x @values_for_feature;
1215 8         9 foreach my $sample (@samples_for_class) {
1216 140         81 my @features_and_values = @{$self->{_training_data_hash}->{$sample}};
  140         181  
1217 140         139 foreach my $i (0..@values_for_feature-1) {
1218 490         313 foreach my $current_value (@features_and_values) {
1219 1960 100       2377 $value_counts[$i]++ if $values_for_feature[$i] eq $current_value;
1220             }
1221             }
1222             }
1223 8         9 my $total_counts = 0;
1224 8         7 map {$total_counts += $_} @value_counts;
  28         23  
1225 8 50       29 die "PFVC2 Something is wrong with your training file. It contains no training " .
1226             "samples for Class $class_name and Feature $feature_name" if $total_counts == 0;
1227             # We normalize by total_count because the probabilities are conditioned on a given class
1228 8         11 foreach my $i (0..@values_for_feature-1) {
1229 28         32 my $feature_and_value_and_class = "$values_for_feature[$i]" . "::" . "$class_name";
1230 28         61 $self->{_probability_cache}->{$feature_and_value_and_class} =
1231             $value_counts[$i] / (1.0 * $total_counts);
1232             }
1233 8 50       11 if (exists $self->{_probability_cache}->{$feature_value_class}) {
1234 8         33 return $self->{_probability_cache}->{$feature_value_class};
1235             } else {
1236 0         0 return 0;
1237             }
1238             }
1239             }
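## The value returned above is a within-class ratio of counts: occurrences of the
## particular "feature=value" string among the training samples of the given class,
## divided by all value occurrences recorded for that feature in the same class.
## With hypothetical numbers, 6 occurrences of "exercising=never" among the 14
## values recorded for that feature in a class gives P(exercising=never | class) = 6/14.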
1240              
1241             sub probability_of_feature_less_than_threshold {
1242 0     0 0 0 my $self = shift;
1243 0         0 my $feature_name = shift;
1244 0         0 my $threshold = shift;
1245 0         0 my $feature_threshold_combo = "$feature_name" . '<' . "$threshold";
1246             return $self->{_probability_cache}->{$feature_threshold_combo}
1247 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_combo});
1248 0 0       0 my @all_values = grep {$_ if $_ ne 'NA'} @{$self->{_features_and_values_hash}->{$feature_name}};
  0         0  
  0         0  
1249 0 0       0 my @all_values_less_than_threshold = grep {$_ if $_ <= $threshold} @all_values;
  0         0  
1250 0         0 my $probability = (1.0 * @all_values_less_than_threshold) / @all_values;
1251 0         0 $self->{_probability_cache}->{$feature_threshold_combo} = $probability;
1252 0         0 return $probability;
1253             }
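## The unconditioned version directly above is just the fraction of the non-'NA'
## training values of the feature that do not exceed the threshold; e.g., with
## hypothetical counts, 5 qualifying values out of 20 yields a probability of 0.25.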
1254              
1255             sub probability_of_feature_less_than_threshold_given_class {
1256 0     0 0 0 my $self = shift;
1257 0         0 my $feature_name = shift;
1258 0         0 my $threshold = shift;
1259 0         0 my $class_name = shift;
1260 0         0 my $feature_threshold_class_combo = "$feature_name" . "<" . "$threshold" . "::" . "$class_name";
1261             return $self->{_probability_cache}->{$feature_threshold_class_combo}
1262 0 0       0 if (exists $self->{_probability_cache}->{$feature_threshold_class_combo});
1263 0         0 my @data_samples_for_class = ();
1264             # Accumulate all sample names for the given class:
1265 0         0 foreach my $sample_name (keys %{$self->{_samples_class_label_hash}}) {
  0         0  
1266             push @data_samples_for_class, $sample_name
1267 0 0       0 if $self->{_samples_class_label_hash}->{$sample_name} eq $class_name;
1268             }
1269 0         0 my @actual_feature_values_for_samples_in_class = ();
1270 0         0 foreach my $sample (@data_samples_for_class) {
1271 0         0 foreach my $feature_and_value (@{$self->{_training_data_hash}->{$sample}}) {
  0         0  
1272 0         0 my $pattern = '(.+)=(.+)';
1273 0         0 $feature_and_value =~ /$pattern/;
1274 0         0 my ($feature,$value) = ($1,$2);
1275 0 0 0     0 push @actual_feature_values_for_samples_in_class, $value
1276             if $feature eq $feature_name && $value ne 'NA';
1277             }
1278             }
1279 0 0       0 my @actual_points_for_feature_less_than_threshold = grep {$_ if $_ <= $threshold} @actual_feature_values_for_samples_in_class;
  0         0  
1280             # The condition in the assignment that follows was a bug correction in Version 3.20
1281 0 0       0 my $probability = @actual_feature_values_for_samples_in_class > 0 ? ((1.0 * @actual_points_for_feature_less_than_threshold) / @actual_feature_values_for_samples_in_class) : 0.0;
1282 0         0 $self->{_probability_cache}->{$feature_threshold_class_combo} = $probability;
1283 0         0 return $probability;
1284             }
1285              
1286             # This method requires that all truly numeric types only be expressed as '<' or '>'
1287             # constructs in the array of branch features and thresholds
1288             sub probability_of_a_sequence_of_features_and_values_or_thresholds {
1289 233     233 0 160 my $self = shift;
1290 233         157 my $arr = shift;
1291 233         253 my @array_of_features_and_values_or_thresholds = @$arr;
1292 233 50       294 return if scalar @array_of_features_and_values_or_thresholds == 0;
1293 233         278 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1294 233 100       463 return $self->{_probability_cache}->{$sequence} if exists $self->{_probability_cache}->{$sequence};
1295 68         48 my $probability = undef;
1296 68         46 my $pattern1 = '(.+)=(.+)';
1297 68         34 my $pattern2 = '(.+)<(.+)';
1298 68         47 my $pattern3 = '(.+)>(.+)';
1299 68         53 my @true_numeric_types = ();
1300 68         59 my @true_numeric_types_feature_names = ();
1301 68         55 my @symbolic_types = ();
1302 68         41 my @symbolic_types_feature_names = ();
1303 68         69 foreach my $item (@array_of_features_and_values_or_thresholds) {
1304 186 50       358 if ($item =~ /$pattern2/) {
    50          
1305 0         0 push @true_numeric_types, $item;
1306 0         0 my ($feature,$value) = ($1,$2);
1307 0         0 push @true_numeric_types_feature_names, $feature;
1308             } elsif ($item =~ /$pattern3/) {
1309 0         0 push @true_numeric_types, $item;
1310 0         0 my ($feature,$value) = ($1,$2);
1311 0         0 push @true_numeric_types_feature_names, $feature;
1312             } else {
1313 186         128 push @symbolic_types, $item;
1314 186         280 $item =~ /$pattern1/;
1315 186         237 my ($feature,$value) = ($1,$2);
1316 186         193 push @symbolic_types_feature_names, $feature;
1317             }
1318             }
1319 68         66 my %seen1 = ();
1320 68 0       69 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1321 68         40 my %seen2 = ();
1322 68 50       61 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  186         409  
1323 68         107 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1324             print_array_with_msg("POS: Answer returned by find_bounded: ",
1325 68 50       81 $bounded_intervals_numeric_types) if $self->{_debug2};
1326             # Calculate the upper and the lower bounds to be used when searching for the best
1327             # threshold for each of the numeric features that are in play at the current node:
1328 68         44 my (%upperbound, %lowerbound);
1329 68         62 foreach my $feature_name (@true_numeric_types_feature_names) {
1330 0         0 $upperbound{$feature_name} = undef;
1331 0         0 $lowerbound{$feature_name} = undef;
1332             }
1333 68         69 foreach my $item (@$bounded_intervals_numeric_types) {
1334 0         0 foreach my $feature_grouping (@$item) {
1335 0 0       0 if ($feature_grouping->[1] eq '>') {
1336 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1337             } else {
1338 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1339             }
1340             }
1341             }
1342 68         57 foreach my $feature_name (@true_numeric_types_feature_names) {
1343 0 0 0     0 if (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name}) &&
    0 0        
    0 0        
    0 0        
      0        
1344             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1345 0         0 return 0;
1346             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1347 0 0       0 if (! $probability) {
1348             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1349             $upperbound{$feature_name}) -
1350 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name});
1351             } else {
1352             $probability *= ($self->probability_of_feature_less_than_threshold($feature_name,
1353             $upperbound{$feature_name}) -
1354 0         0 $self->probability_of_feature_less_than_threshold($feature_name, $lowerbound{$feature_name}))
1355             }
1356             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1357 0 0       0 if (! $probability) {
1358             $probability = $self->probability_of_feature_less_than_threshold($feature_name,
1359 0         0 $upperbound{$feature_name});
1360             } else {
1361             $probability *= $self->probability_of_feature_less_than_threshold($feature_name,
1362 0         0 $upperbound{$feature_name});
1363             }
1364             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1365 0 0       0 if (! $probability) {
1366             $probability = 1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1367 0         0 $lowerbound{$feature_name});
1368             } else {
1369             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold($feature_name,
1370 0         0 $lowerbound{$feature_name}));
1371             }
1372             } else {
1373 0         0 die("Ill formatted call to 'probability_of_sequence' method");
1374             }
1375             }
1376 68         50 foreach my $feature_and_value (@symbolic_types) {
1377 186 50       508 if ($feature_and_value =~ /$pattern1/) {
1378 186         228 my ($feature,$value) = ($1,$2);
1379 186 100       179 if (! $probability) {
1380 68         113 $probability = $self->probability_of_feature_value($feature, $value);
1381             } else {
1382 118         132 $probability *= $self->probability_of_feature_value($feature, $value);
1383             }
1384             }
1385             }
1386 68         106 $self->{_probability_cache}->{$sequence} = $probability;
1387 68         152 return $probability;
1388             }
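## After the branch thresholds are reduced to at most one lower and one upper bound
## per numeric feature, the probability of the whole branch sequence is accumulated
## as a product: an interval term for each bounded numeric feature and a
## P(feature=value) term for each symbolic entry.  Illustrative sketch with
## hypothetical feature names and thresholds:
##
##     P("g2>3.8" : "grade=2")  =  (1.0 - P(g2 < 3.8)) * P(grade=2)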
1389              
1390             ## The following method requires that all truly numeric types only be expressed as
1391             ## '<' or '>' constructs in the array of branch features and thresholds
1392             sub probability_of_a_sequence_of_features_and_values_or_thresholds_given_class {
1393 164     164 0 108 my $self = shift;
1394 164         103 my $arr = shift;
1395 164         118 my $class_name = shift;
1396 164         180 my @array_of_features_and_values_or_thresholds = @$arr;
1397 164 50       216 return if scalar @array_of_features_and_values_or_thresholds == 0;
1398 164         212 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1399 164         178 my $sequence_with_class = "$sequence" . "::" . $class_name;
1400             return $self->{_probability_cache}->{$sequence_with_class}
1401 164 100       271 if exists $self->{_probability_cache}->{$sequence_with_class};
1402 144         95 my $probability = undef;
1403 144         118 my $pattern1 = '(.+)=(.+)';
1404 144         85 my $pattern2 = '(.+)<(.+)';
1405 144         85 my $pattern3 = '(.+)>(.+)';
1406 144         132 my @true_numeric_types = ();
1407 144         97 my @true_numeric_types_feature_names = ();
1408 144         86 my @symbolic_types = ();
1409 144         106 my @symbolic_types_feature_names = ();
1410 144         128 foreach my $item (@array_of_features_and_values_or_thresholds) {
1411 380 50       704 if ($item =~ /$pattern2/) {
    50          
1412 0         0 push @true_numeric_types, $item;
1413 0         0 my ($feature,$value) = ($1,$2);
1414 0         0 push @true_numeric_types_feature_names, $feature;
1415             } elsif ($item =~ /$pattern3/) {
1416 0         0 push @true_numeric_types, $item;
1417 0         0 my ($feature,$value) = ($1,$2);
1418 0         0 push @true_numeric_types_feature_names, $feature;
1419             } else {
1420 380         269 push @symbolic_types, $item;
1421 380         603 $item =~ /$pattern1/;
1422 380         486 my ($feature,$value) = ($1,$2);
1423 380         416 push @symbolic_types_feature_names, $feature;
1424             }
1425             }
1426 144         133 my %seen1 = ();
1427 144 0       125 @true_numeric_types_feature_names = grep {$_ if !$seen1{$_}++} @true_numeric_types_feature_names;
  0         0  
1428 144         101 my %seen2 = ();
1429 144 50       120 @symbolic_types_feature_names = grep {$_ if !$seen2{$_}++} @symbolic_types_feature_names;
  380         841  
1430 144         205 my $bounded_intervals_numeric_types = $self->find_bounded_intervals_for_numeric_features(\@true_numeric_types);
1431             print_array_with_msg("POSC: Answer returned by find_bounded: ",
1432 144 50       186 $bounded_intervals_numeric_types) if $self->{_debug2};
1433             # Calculate the upper and the lower bounds to be used when searching for the best
1434             # threshold for each of the numeric features that are in play at the current node:
1435 144         93 my (%upperbound, %lowerbound);
1436 144         130 foreach my $feature_name (@true_numeric_types_feature_names) {
1437 0         0 $upperbound{$feature_name} = undef;
1438 0         0 $lowerbound{$feature_name} = undef;
1439             }
1440 144         129 foreach my $item (@$bounded_intervals_numeric_types) {
1441 0         0 foreach my $feature_grouping (@$item) {
1442 0 0       0 if ($feature_grouping->[1] eq '>') {
1443 0         0 $lowerbound{$feature_grouping->[0]} = $feature_grouping->[2];
1444             } else {
1445 0         0 $upperbound{$feature_grouping->[0]} = $feature_grouping->[2];
1446             }
1447             }
1448             }
1449 144         119 foreach my $feature_name (@true_numeric_types_feature_names) {
1450 0 0 0     0 if ($lowerbound{$feature_name} && $upperbound{$feature_name} &&
    0 0        
    0 0        
    0 0        
      0        
1451             $upperbound{$feature_name} <= $lowerbound{$feature_name}) {
1452 0         0 return 0;
1453             } elsif (defined($lowerbound{$feature_name}) && defined($upperbound{$feature_name})) {
1454 0 0       0 if (! $probability) {
1455              
1456             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1457             $upperbound{$feature_name}, $class_name) -
1458             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1459 0         0 $lowerbound{$feature_name}, $class_name);
1460             } else {
1461             $probability *= ($self->probability_of_feature_less_than_threshold_given_class($feature_name,
1462             $upperbound{$feature_name}, $class_name) -
1463             $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1464 0         0 $lowerbound{$feature_name}, $class_name))
1465             }
1466             } elsif (defined($upperbound{$feature_name}) && ! defined($lowerbound{$feature_name})) {
1467 0 0       0 if (! $probability) {
1468             $probability = $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1469 0         0 $upperbound{$feature_name}, $class_name);
1470             } else {
1471             $probability *= $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1472 0         0 $upperbound{$feature_name}, $class_name);
1473             }
1474             } elsif (defined($lowerbound{$feature_name}) && ! defined($upperbound{$feature_name})) {
1475 0 0       0 if (! $probability) {
1476             $probability = 1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1477 0         0 $lowerbound{$feature_name}, $class_name);
1478             } else {
1479             $probability *= (1.0 - $self->probability_of_feature_less_than_threshold_given_class($feature_name,
1480 0         0 $lowerbound{$feature_name}, $class_name));
1481             }
1482             } else {
1483 0         0 die("Ill formatted call to 'probability of sequence given class' method");
1484             }
1485             }
1486 144         122 foreach my $feature_and_value (@symbolic_types) {
1487 380 50       1027 if ($feature_and_value =~ /$pattern1/) {
1488 380         493 my ($feature,$value) = ($1,$2);
1489 380 100       365 if (! $probability) {
1490 144         192 $probability = $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1491             } else {
1492 236         272 $probability *= $self->probability_of_feature_value_given_class($feature, $value, $class_name);
1493             }
1494             }
1495             }
1496 144         250 $self->{_probability_cache}->{$sequence_with_class} = $probability;
1497 144         337 return $probability;
1498             }
1499              
1500             sub probability_of_a_class_given_sequence_of_features_and_values_or_thresholds {
1501 244     244 0 153 my $self = shift;
1502 244         161 my $class_name = shift;
1503 244         157 my $arr = shift;
1504 244         248 my @array_of_features_and_values_or_thresholds = @$arr;
1505 244         240 my $sequence = join ':', @array_of_features_and_values_or_thresholds;
1506 244         233 my $class_and_sequence = "$class_name" . "::" . $sequence;
1507             return $self->{_probability_cache}->{$class_and_sequence}
1508 244 100       549 if exists $self->{_probability_cache}->{$class_and_sequence};
1509 82         55 my @array_of_class_probabilities = (0) x scalar @{$self->{_class_names}};
  82         114  
1510 82         54 foreach my $i (0..@{$self->{_class_names}}-1) {
  82         100  
1511 164         131 my $class_name = $self->{_class_names}->[$i];
1512 164         198 my $prob = $self->probability_of_a_sequence_of_features_and_values_or_thresholds_given_class(
1513             \@array_of_features_and_values_or_thresholds, $class_name);
1514 164 100       224 if ($prob < 0.000001) {
1515 13         16 $array_of_class_probabilities[$i] = 0.0;
1516 13         18 next;
1517             }
1518 151         208 my $prob_of_feature_sequence = $self->probability_of_a_sequence_of_features_and_values_or_thresholds(
1519             \@array_of_features_and_values_or_thresholds);
1520             # die "PCS Something is wrong with your sequence of feature values and thresholds in " .
1521             # "probability_of_a_class_given_sequence_of_features_and_values_or_thresholds()"
1522             # if ! $prob_of_feature_sequence;
1523 151         186 my $prior = $self->{_class_priors_hash}->{$self->{_class_names}->[$i]};
1524 151 50       159 if ($prob_of_feature_sequence) {
1525 151         210 $array_of_class_probabilities[$i] = $prob * $prior / $prob_of_feature_sequence;
1526             } else {
1527 0         0 $array_of_class_probabilities[$i] = $prior;
1528             }
1529             }
1530 82         56 my $sum_probability;
1531 82         66 map {$sum_probability += $_} @array_of_class_probabilities;
  164         159  
1532 82 50       92 if ($sum_probability == 0) {
1533 0         0 @array_of_class_probabilities = map {1.0 / (scalar @{$self->{_class_names}})}
  0         0  
1534 0         0 (0..@{$self->{_class_names}}-1);
  0         0  
1535             } else {
1536 82         63 @array_of_class_probabilities = map {$_ * 1.0 / $sum_probability} @array_of_class_probabilities;
  164         200  
1537             }
1538 82         64 foreach my $i (0..@{$self->{_class_names}}-1) {
  82         143  
1539 164         204 my $this_class_and_sequence = "$self->{_class_names}->[$i]" . "::" . "$sequence";
1540 164         275 $self->{_probability_cache}->{$this_class_and_sequence} = $array_of_class_probabilities[$i];
1541             }
1542 82         138 return $self->{_probability_cache}->{$class_and_sequence};
1543             }
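## The posterior returned above is Bayes' rule followed by a normalization over the
## classes:
##
##     P(class | sequence)  =  P(sequence | class) * P(class) / P(sequence)
##
## with a fallback to the prior when P(sequence) is zero and to a uniform
## distribution when every unnormalized posterior is zero.  A worked example with
## hypothetical numbers: priors of 0.4 and 0.6 and likelihoods of 0.02 and 0.01 give
## unnormalized posteriors proportional to 0.008 and 0.006, which normalize to
## 8/14 and 6/14, roughly 0.57 and 0.43.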
1544              
1545             ####################################### Class Based Utilities ##########################################
1546              
1547             ## Given a list of branch attributes for the numeric features of the form, say,
1548             ## ['g2<1','g2<2','g2<3','age>34','age>36','age>37'], this method returns the
1549             ## smallest list that is relevant for the purpose of calculating the probabilities.
1550             ## To explain, the probability that the feature `g2' is less than 1 AND, at the same
1551             ## time, less than 2, AND, at the same time, less than 3, is the same as the
1552             ## probability that the feature less than 1. Similarly, the probability that 'age'
1553             ## is greater than 34 and also greater than 37 is the same as `age' being greater
1554             ## than 37.
1555             sub find_bounded_intervals_for_numeric_features {
1556 230     230 0 178 my $self = shift;
1557 230         141 my $arr = shift;
1558 230         174 my @arr = @$arr;
1559 230         151 my @features = @{$self->{_feature_names}};
  230         295  
1560 230         192 my @arr1 = map {my @x = split /(>|<)/, $_; \@x} @arr;
  0         0  
  0         0  
1561 230 50       316 print_array_with_msg("arr1", \@arr1) if $self->{_debug2};
1562 230         155 my @arr3 = ();
1563 230         198 foreach my $feature_name (@features) {
1564 920         624 my @temp = ();
1565 920         633 foreach my $x (@arr1) {
1566 0 0 0     0 push @temp, $x if @$x > 0 && $x->[0] eq $feature_name;
1567             }
1568 920 50       1201 push @arr3, \@temp if @temp > 0;
1569             }
1570 230 50       276 print_array_with_msg("arr3", \@arr3) if $self->{_debug2};
1571             # Sort each list so that '<' entries occur before '>' entries:
1572 230         138 my @arr4;
1573 230         166 foreach my $li (@arr3) {
1574 0         0 my @sorted = sort {$a->[1] cmp $b->[1]} @$li;
  0         0  
1575 0         0 push @arr4, \@sorted;
1576             }
1577 230 50       281 print_array_with_msg("arr4", \@arr4) if $self->{_debug2};
1578 230         168 my @arr5;
1579 230         181 foreach my $li (@arr4) {
1580 0         0 my @temp1 = ();
1581 0         0 my @temp2 = ();
1582 0         0 foreach my $inner (@$li) {
1583 0 0       0 if ($inner->[1] eq '<') {
1584 0         0 push @temp1, $inner;
1585             } else {
1586 0         0 push @temp2, $inner;
1587             }
1588             }
1589 0 0 0     0 if (@temp1 > 0 && @temp2 > 0) {
    0          
1590 0         0 push @arr5, [\@temp1, \@temp2];
1591             } elsif (@temp1 > 0) {
1592 0         0 push @arr5, [\@temp1];
1593             } else {
1594 0         0 push @arr5, [\@temp2];
1595             }
1596             }
1597 230 50       263 print_array_with_msg("arr5", \@arr5) if $self->{_debug2};
1598 230         180 my @arr6 = ();
1599 230         168 foreach my $li (@arr5) {
1600 0         0 my @temp1 = ();
1601 0         0 foreach my $inner (@$li) {
1602 0         0 my @sorted = sort {$a->[2] <=> $b->[2]} @$inner;
  0         0  
1603 0         0 push @temp1, \@sorted;
1604             }
1605 0         0 push @arr6, \@temp1;
1606             }
1607 230 50       256 print_array_with_msg("arr6", \@arr6) if $self->{_debug2};
1608 230         182 my @arr9 = ();
1609 230         204 foreach my $li (@arr6) {
1610 0         0 foreach my $alist (@$li) {
1611 0         0 my @newalist = ();
1612 0 0       0 if ($alist->[0][1] eq '<') {
1613 0         0 push @newalist, $alist->[0];
1614             } else {
1615 0         0 push @newalist, $alist->[-1];
1616             }
1617 0 0       0 if ($alist->[0][1] ne $alist->[-1][1]) {
1618 0         0 push @newalist, $alist->[-1];
1619             }
1620 0         0 push @arr9, \@newalist;
1621             }
1622             }
1623 230 50       292 print_array_with_msg('arr9', \@arr9) if $self->{_debug2};
1624 230         324 return \@arr9;
1625              
1626             }
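## To make the reduction concrete: for the example input mentioned in the comment
## block above,
##
##     ['g2<1', 'g2<2', 'g2<3', 'age>34', 'age>36', 'age>37']
##
## the only bounds that matter are "g2 < 1" and "age > 37", so the returned structure
## carries one [feature_name, comparison_op, threshold] entry per bounded side of
## each numeric feature.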
1627              
1628             ## This method is used to verify that you used legal feature names in the test
1629             ## sample that you want to classify with the decision tree.
1630             sub check_names_used {
1631 0     0 0 0 my $self = shift;
1632 0         0 my $features_and_values_test_data = shift;
1633 0         0 my @features_and_values_test_data = @$features_and_values_test_data;
1634 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
1635 0         0 foreach my $feature_and_value (@features_and_values_test_data) {
1636 0         0 $feature_and_value =~ /$pattern/;
1637 0         0 my ($feature,$value) = ($1,$2);
1638 0 0 0     0 die "Your test data has formatting error" unless defined($feature) && defined($value);
1639 0 0       0 return 0 unless contained_in($feature, @{$self->{_feature_names}});
  0         0  
1640             }
1641 0         0 return 1;
1642             }
1643              
1644             ####################################### Data Condition Calculator ######################################
1645              
1646             ## This method estimates the worst-case fan-out of the decision tree taking into
1647             ## account the number of values (and therefore the number of branches emanating from
1648             ## a node) for the symbolic features.
1649             sub determine_data_condition {
1650 0     0 0 0 my $self = shift;
1651 0         0 my $num_of_features = scalar @{$self->{_feature_names}};
  0         0  
1652 0         0 my @values = ();
1653 0         0 my @number_of_values;
1654 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
1655 0         0 push @values, @{$self->{_features_and_unique_values_hash}->{$feature}}
1656 0 0       0 if ! contained_in($feature, keys %{$self->{_numeric_features_valuerange_hash}});
  0         0  
1657 0         0 push @number_of_values, scalar @values;
1658             }
1659 0 0       0 return if ! @values;
1660 0         0 print "Number of features: $num_of_features\n";
1661 0         0 my @minmax = minmax(\@number_of_values);
1662 0         0 my $max_num_values = $minmax[1];
1663 0         0 print "Largest number of values for symbolic features is: $max_num_values\n";
1664 0         0 my $estimated_number_of_nodes = $max_num_values ** $num_of_features;
1665 0         0 print "\nWORST CASE SCENARIO: The decision tree COULD have as many as $estimated_number_of_nodes " .
1666             "nodes. The exact number of nodes created depends critically on " .
1667             "the entropy_threshold used for node expansion (the default value " .
1668             "for this threshold is 0.01) and on the value set for max_depth_desired " .
1669             "for the depth of the tree.\n";
1670 0 0       0 if ($estimated_number_of_nodes > 10000) {
1671 0         0 print "\nTHIS IS WAY TOO MANY NODES. Consider using a relatively " .
1672             "large value for entropy_threshold and/or a small value for " .
1673             "for max_depth_desired to reduce the number of nodes created.\n";
1674 0         0 print "\nDo you wish to continue anyway? Enter 'y' for yes: ";
1675 0         0 my $answer = ;
1676 0         0 $answer =~ s/\r?\n?$//;
1677 0   0     0 while ( ($answer !~ /y(es)?/i) && ($answer !~ /n(o)?/i) ) {
1678 0         0 print "\nAnswer not recognized. Let's try again. Enter 'y' or 'n': ";
1679 0         0 $answer = <STDIN>;
1680 0         0 $answer =~ s/\r?\n?$//;
1681             }
1682 0 0       0 die unless $answer =~ /y(es)?/i;
1683             }
1684             }
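## The worst-case estimate printed above is simply max_num_values raised to the power
## of the number of features.  For instance, 4 symbolic features with at most 3
## distinct values each gives 3 ** 4 = 81 nodes, while 8 features with 5 values each
## gives 5 ** 8 = 390625, which is why the method asks for confirmation once the
## estimate exceeds 10000 nodes.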
1685              
1686              
1687             #################################### Read Training Data From File ######################################
1688              
1689              
1690             sub get_training_data {
1691 1     1 1 5 my $self = shift;
1692 1 50       11 die("Aborted. get_training_data_csv() is only for CSV files") unless $self->{_training_datafile} =~ /\.csv$/;
1693 1         2 my %class_names = ();
1694 1         2 my %all_record_ids_with_class_labels;
1695             my $firstline;
1696 0         0 my %data_hash;
1697 1         2 $|++;
1698 1   50     24 open(FILEIN, $self->{_training_datafile}) || die "unable to open $self->{_training_datafile}: $!";
1699 1         2 my $record_index = 0;
1700 1         1 my $firsetline;
1701 1         13 while (<FILEIN>) {
1702 36 50       64 next if /^[ ]*\r?\n?$/;
1703 36         72 $_ =~ s/\r?\n?$//;
1704 36 50       47 my $record = $self->{_csv_cleanup_needed} ? cleanup_csv($_) : $_;
1705 36 100       42 if ($record_index == 0) {
1706 1         1 $firstline = $record;
1707 1         1 $record_index++;
1708 1         3 next;
1709             }
1710 35         102 my @parts = split /,/, $record;
1711 35         29 my $classname = $parts[$self->{_csv_class_column_index}];
1712 35         22 $class_names{$classname} = 1;
1713 35         29 my $record_label = shift @parts;
1714 35         46 $record_label =~ s/^\s*\"|\"\s*$//g;
1715 35         48 $data_hash{$record_label} = \@parts;
1716 35         31 $all_record_ids_with_class_labels{$record_label} = $classname;
1717 35 50       45 print "." if $record_index % 10000 == 0;
1718 35         74 $record_index++;
1719             }
1720 1         6 close FILEIN;
1721 1         2 $|--;
1722 1         2 $self->{_how_many_total_training_samples} = $record_index - 1; # must subtract 1 for the header record
1723 1 50       3 print "\n\nTotal number of training samples: $self->{_how_many_total_training_samples}\n" if $self->{_debug1};
1724 1         6 my @all_feature_names = split /,/, substr($firstline, index($firstline,','));
1725 1         2 my $class_column_heading = $all_feature_names[$self->{_csv_class_column_index}];
1726 1         2 my @all_class_names = sort map {"$class_column_heading=$_"} keys %class_names;
  2         6  
1727 1         2 my @feature_names = map {$all_feature_names[$_]} @{$self->{_csv_columns_for_features}};
  4         5  
  1         2  
1728 1         6 my %class_for_sample_hash = map {"sample_" . $_ => "$class_column_heading=" . $data_hash{$_}->[$self->{_csv_class_column_index} - 1 ] } keys %data_hash;
  35         59  
1729 1         7 my @sample_names = map {"sample_$_"} keys %data_hash;
  35         30  
1730 1 50       6 my %feature_values_for_samples_hash = map {my $sampleID = $_; "sample_" . $sampleID => [map {my $fname = $all_feature_names[$_]; $fname . "=" . eval{$data_hash{$sampleID}->[$_-1] =~ /^\d+$/ ? sprintf("%.1f", $data_hash{$sampleID}->[$_-1] ) : $data_hash{$sampleID}->[$_-1] } } @{$self->{_csv_columns_for_features}} ] } keys %data_hash;
  35         21  
  35         26  
  140         99  
  140         97  
  140         331  
  35         32  
1731 1 50       5 my %features_and_values_hash = map { my $a = $_; {$all_feature_names[$a] => [ map {my $b = $_; $b =~ /^\d+$/ ? sprintf("%.1f",$b) : $b} map {$data_hash{$_}->[$a-1]} keys %data_hash ]} } @{$self->{_csv_columns_for_features}};
  4         3  
  4         3  
  4         16  
  140         78  
  140         199  
  140         111  
  1         2  
1732 1         2 my %numeric_features_valuerange_hash = ();
1733 1         1 my %feature_values_how_many_uniques_hash = ();
1734 1         1 my %features_and_unique_values_hash = ();
1735 1         3 my $numregex = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
1736 1         3 foreach my $feature (keys %features_and_values_hash) {
1737 4         4 my %seen1 = ();
1738 140 100 66     369 my @unique_values_for_feature = sort grep {$_ if $_ ne 'NA' && !$seen1{$_}++}
1739 4         3 @{$features_and_values_hash{$feature}};
  4         6  
1740 4         5 $feature_values_how_many_uniques_hash{$feature} = scalar @unique_values_for_feature;
1741 4         5 my $not_all_values_float = 0;
1742 4 50       5 map {$not_all_values_float = 1 if $_ !~ /^$numregex$/} @unique_values_for_feature;
  14         92  
1743 4 50       9 if ($not_all_values_float == 0) {
1744 0         0 my @minmaxvalues = minmax(\@unique_values_for_feature);
1745 0         0 $numeric_features_valuerange_hash{$feature} = \@minmaxvalues;
1746             }
1747 4         8 $features_and_unique_values_hash{$feature} = \@unique_values_for_feature;
1748             }
1749 1 50       4 if ($self->{_debug1}) {
1750 0         0 print "\nAll class names: @all_class_names\n";
1751 0         0 print "\nEach sample data record:\n";
1752 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %feature_values_for_samples_hash) {
  0         0  
1753 0         0 print "$sample => @{$feature_values_for_samples_hash{$sample}}\n";
  0         0  
1754             }
1755 0         0 print "\nclass label for each data sample:\n";
1756 0         0 foreach my $sample (sort {sample_index($a) <=> sample_index($b)} keys %class_for_sample_hash) {
  0         0  
1757 0         0 print "$sample => $class_for_sample_hash{$sample}\n";
1758             }
1759 0         0 print "\nFeatures used: @feature_names\n\n";
1760 0         0 print "\nfeatures and the values taken by them:\n";
1761 0         0 foreach my $feature (sort keys %features_and_values_hash) {
1762 0         0 print "$feature => @{$features_and_values_hash{$feature}}\n";
  0         0  
1763             }
1764 0         0 print "\nnumeric features and their ranges:\n";
1765 0         0 foreach my $feature (sort keys %numeric_features_valuerange_hash) {
1766 0         0 print "$feature => @{$numeric_features_valuerange_hash{$feature}}\n";
  0         0  
1767             }
1768 0         0 print "\nnumber of unique values in each feature:\n";
1769 0         0 foreach my $feature (sort keys %feature_values_how_many_uniques_hash) {
1770 0         0 print "$feature => $feature_values_how_many_uniques_hash{$feature}\n";
1771             }
1772             }
1773 1         2 $self->{_class_names} = \@all_class_names;
1774 1         5 $self->{_feature_names} = \@feature_names;
1775 1         2 $self->{_samples_class_label_hash} = \%class_for_sample_hash;
1776 1         2 $self->{_training_data_hash} = \%feature_values_for_samples_hash;
1777 1         2 $self->{_features_and_values_hash} = \%features_and_values_hash;
1778 1         2 $self->{_features_and_unique_values_hash} = \%features_and_unique_values_hash;
1779 1         1 $self->{_numeric_features_valuerange_hash} = \%numeric_features_valuerange_hash;
1780 1         18 $self->{_feature_values_how_many_uniques_hash} = \%feature_values_how_many_uniques_hash;
1781             }
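## A minimal sketch of how this CSV reader expects to be driven.  The file name, the
## column layout, and the option values below are hypothetical; the firm requirements
## visible in this method are that the file name end in ".csv", that column 0 of each
## record hold a unique integer identifier, and that the class column and the feature
## columns be named by index through the constructor options.
##
##     use Algorithm::DecisionTree;
##     # hypothetical layout: col 0 = record id, col 1 = class label, cols 2..5 = features
##     my $dt = Algorithm::DecisionTree->new(
##                    training_datafile        => 'stage3cancer.csv',
##                    csv_class_column_index   => 1,
##                    csv_columns_for_features => [2, 3, 4, 5],
##                    csv_cleanup_needed       => 1,
##              );
##     $dt->get_training_data();
##     $dt->calculate_first_order_probabilities();
##     $dt->calculate_class_priors();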
1782              
1783             sub show_training_data {
1784 0     0 1 0 my $self = shift;
1785 0         0 my @class_names = @{$self->{_class_names}};
  0         0  
1786 0         0 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  0         0  
1787 0         0 my %samples_class_label_hash = %{$self->{_samples_class_label_hash}};
  0         0  
1788 0         0 my %training_data_hash = %{$self->{_training_data_hash}};
  0         0  
1789 0         0 print "\n\nClass Names: @class_names\n";
1790 0         0 print "\n\nFeatures and Their Values:\n\n";
1791 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
1792 0         0 print "$k ---> @{$features_and_values_hash{$k}}\n";
  0         0  
1793             }
1794 0         0 print "\n\nSamples vs. Class Labels:\n\n";
1795 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %samples_class_label_hash) {
  0         0  
1796 0         0 print "$kee => $samples_class_label_hash{$kee}\n";
1797             }
1798 0         0 print "\n\nTraining Samples:\n\n";
1799 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)}
  0         0  
1800             keys %training_data_hash) {
1801 0         0 print "$kee => @{$training_data_hash{$kee}}\n";
  0         0  
1802             }
1803             }
1804              
1805             sub get_class_names {
1806 0     0 0 0 my $self = shift;
1807 0         0 return @{$self->{_class_names}}
  0         0  
1808             }
1809              
1810             ########################################## Utility Routines ############################################
1811              
1812             sub closest_sampling_point {
1813 0     0 0 0 my $value = shift;
1814 0         0 my $arr_ref = shift;
1815 0         0 my @arr = @{$arr_ref};
  0         0  
1816 0         0 my @compare = map {abs($_ - $value)} @arr;
  0         0  
1817 0         0 my ($minval,$index) = minimum(\@compare);
1818 0         0 return $arr[$index];
1819             }
1820              
1821             ## returns the array index that contains a specified STRING value: (meant only for array of strings)
1822             sub get_index_at_value {
1823 0     0 0 0 my $value = shift;
1824 0         0 my @array = @{shift @_};
  0         0  
1825 0         0 foreach my $i (0..@array-1) {
1826 0 0       0 return $i if $value eq $array[$i];
1827             }
1828             }
1829              
1830             ## When the training data is read from a CSV file, we assume that the first column
1831             ## of each data record contains a unique integer identifier for the record in that
1832             ## row. This training data is stored in a hash whose keys are the prefix 'sample_'
1833             ## followed by the identifying integers. The purpose of this function is to return
1834             ## the identifying integer associated with a data record.
1835             sub sample_index {
1836 1096     1096 0 678 my $arg = shift;
1837 1096         1079 $arg =~ /_(.+)$/;
1838 1096         1212 return $1;
1839             }
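## For example, sample_index("sample_23") returns 23, which is what makes the numeric
## sorts of the form  sort {sample_index($a) <=> sample_index($b)}  used elsewhere in
## this module work as intended.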
1840              
1841             # Returns the minimum value and its positional index in an array
1842             sub minimum {
1843 0     0 0 0 my $arr = shift;
1844 0         0 my $min;
1845             my $index;
1846 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1847 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1848 0         0 $index = $i;
1849 0         0 $min = $arr->[$i];
1850             }
1851             }
1852 0         0 return ($min, $index);
1853             }
1854              
1855             # Returns an array of two values, the min and the max, of an array of floats
1856             sub minmax {
1857 0     0 0 0 my $arr = shift;
1858 0         0 my ($min, $max);
1859 0         0 foreach my $i (0..@{$arr}-1) {
  0         0  
1860 0 0 0     0 if ( (!defined $min) || ($arr->[$i] < $min) ) {
1861 0         0 $min = $arr->[$i];
1862             }
1863 0 0 0     0 if ( (!defined $max) || ($arr->[$i] > $max) ) {
1864 0         0 $max = $arr->[$i];
1865             }
1866             }
1867 0         0 return ($min, $max);
1868             }
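## A quick sketch of how the numeric helpers above behave when called from
## within the module (the array below is made up):

my @points        = (2.0, 3.5, 7.0, 9.0);
my ($min, $where) = minimum(\@points);                      # (2.0, 0)
my ($lo, $hi)     = minmax(\@points);                       # (2.0, 9.0)
my $nearest       = closest_sampling_point(4.1, \@points);  # 3.5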
1869              
1870             # checks whether an element is in an array:
1871             sub contained_in {
1872 169     169 0 113 my $ele = shift;
1873 169         162 my @array = @_;
1874 169         95 my $count = 0;
1875 169 100       129 map {$count++ if $ele eq $_} @array;
  376         598  
1876 169         275 return $count;
1877             }
1878              
1879             # Meant only for an array of strings (no nesting):
1880             sub deep_copy_array {
1881 136     136 0 96 my $ref_in = shift;
1882 136         80 my $ref_out;
1883 136 100       206 return [] if scalar @$ref_in == 0;
1884 121         83 foreach my $i (0..@{$ref_in}-1) {
  121         158  
1885 239         251 $ref_out->[$i] = $ref_in->[$i];
1886             }
1887 121         171 return $ref_out;
1888             }
1889              
1890             sub check_for_illegal_params2 {
1891 1     1 0 2 my @params = @_;
1892 1         3 my @legal_params = qw / training_datafile
1893             entropy_threshold
1894             max_depth_desired
1895             csv_class_column_index
1896             csv_columns_for_features
1897             symbolic_to_numeric_cardinality_threshold
1898             number_of_histogram_bins
1899             csv_cleanup_needed
1900             debug1
1901             debug2
1902             debug3
1903             /;
1904 1         1 my $found_match_flag;
1905 1         3 foreach my $param (@params) {
1906 3         2 foreach my $legal (@legal_params) {
1907 10         8 $found_match_flag = 0;
1908 10 100       15 if ($param eq $legal) {
1909 3         2 $found_match_flag = 1;
1910 3         3 last;
1911             }
1912             }
1913 3 50       5 last if $found_match_flag == 0;
1914             }
1915 1         4 return $found_match_flag;
1916             }
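## check_for_illegal_params2() is what makes the constructor croak on a
## misspelled keyword.  A hedged sketch of a constructor call that uses only
## the legal keywords listed above (the CSV file name and the column choices
## are hypothetical):

my $dt = Algorithm::DecisionTree->new(
             training_datafile                         => 'stage3cancer.csv',
             csv_class_column_index                    => 2,
             csv_columns_for_features                  => [3,4,5,6,7,8],
             entropy_threshold                         => 0.01,
             max_depth_desired                         => 8,
             symbolic_to_numeric_cardinality_threshold => 10,
             csv_cleanup_needed                        => 1,
         );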
1917              
1918             sub print_array_with_msg {
1919 0     0 0 0 my $message = shift;
1920 0         0 my $arr = shift;
1921 0         0 print "\n$message: ";
1922 0         0 print_nested_array( $arr );
1923             }
1924              
1925             sub print_nested_array {
1926 0     0 0 0 my $arr = shift;
1927 0         0 my @arr = @$arr;
1928 0         0 print "[";
1929 0         0 foreach my $item (@arr) {
1930 0 0       0 if (ref $item) {
1931 0         0 print_nested_array($item);
1932             } else {
1933 0         0 print "$item";
1934             }
1935             }
1936 0         0 print "]";
1937             }
1938              
1939             sub cleanup_csv {
1940 0     0 0 0 my $line = shift;
1941 0         0 $line =~ tr/\/:?()[]{}'/ /;
1942 0         0 my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
1943 0         0 for (@double_quoted) {
1944 0         0 my $item = $_;
1945 0         0 $item = substr($item, 1, -1);
1946 0         0 $item =~ s/^\s+|,|\s+$//g;    # strip embedded commas and leading/trailing whitespace
1947 0         0 $item = join '_', split /\s+/, $item;
1948 0         0 substr($line, index($line, $_), length($_)) = $item;
1949             }
1950 0         0 my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
1951 0         0 for (@white_spaced) {
1952 0         0 my $item = $_;
1953 0         0 $item =~ s/\s+/_/g;
1954 0         0 $item =~ s/^\s*_|_\s*$//g;
1955 0         0 substr($line, index($line, $_), length($_)) = $item;
1956             }
1957 0         0 $line =~ s/,\s*(?=,|$)/,NA/g;
1958 0         0 return $line;
1959             }
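## A hedged sketch of what cleanup_csv() does to a raw record when called from
## within the module: commas inside double-quoted fields are dropped and the
## remaining words joined with underscores, runs of whitespace in the other
## fields become underscores, and empty fields become 'NA' (the record below
## is made up):

my $raw     = '25,"IDC, Stage 3",  ,4.5';
my $cleaned = cleanup_csv($raw);
# $cleaned should now be:  25,IDC_Stage_3,NA,4.5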
1960              
1961             ######################################### Class EvalTrainingData ########################################
1962              
1963             ## This subclass of the DecisionTree class is used to evaluate the quality of your
1964             ## training data by running a 10-fold cross-validation test on it. This test divides
1965             ## all of the training data into ten parts, with nine parts used for training a
1966             ## decision tree and one part used for testing its ability to classify correctly.
1967             ## This selection of nine parts for training and one part for testing is carried out
1968             ## in all of the ten different possible ways. This testing functionality can also
1969             ## be used to find the best values to use for the constructor parameters
1970             ## entropy_threshold, max_depth_desired, and
1971             ## symbolic_to_numeric_cardinality_threshold.
1972              
1973             ## Only CSV training files can be evaluated in this manner (because only CSV
1974             ## training files are allowed to have numeric features --- which is the more
1975             ## interesting case for evaluation analytics).
1976              
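## A hedged usage sketch for this class (the CSV file name and the column
## choices are hypothetical, and get_training_data() is assumed to be the
## usual data-ingestion call inherited from Algorithm::DecisionTree):

my $evaluator = EvalTrainingData->new(
                    training_datafile        => 'training.csv',
                    csv_class_column_index   => 1,
                    csv_columns_for_features => [2,3,4,5],
                    csv_cleanup_needed       => 1,
                );
$evaluator->get_training_data();
$evaluator->evaluate_training_data();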
1977             package EvalTrainingData;
1978              
1979             @EvalTrainingData::ISA = ('Algorithm::DecisionTree');
1980              
1981             sub new {
1982 0     0   0 my $class = shift;
1983 0         0 my $instance = Algorithm::DecisionTree->new(@_);
1984 0         0 bless $instance, $class;
1985             }
1986              
1987             sub evaluate_training_data {
1988 0     0   0 my $self = shift;
1989 0         0 my $evaldebug = 0;
1990             die "The data evaluation function in the module can only be used when your " .
1991 0 0       0 "training data is in a CSV file" unless $self->{_training_datafile} =~ /\.csv$/;
1992 0         0 print "\nWill run a 10-fold cross-validation test on your training data to test its " .
1993             "class-discriminatory power:\n";
1994 0         0 my %all_training_data = %{$self->{_training_data_hash}};
  0         0  
1995 0         0 my @all_sample_names = sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
1996             Algorithm::DecisionTree::sample_index($b)} keys %all_training_data;
1997 0         0 my $fold_size = int(0.1 * (scalar keys %all_training_data));
1998 0         0 print "fold size: $fold_size\n";
1999 0         0 my %confusion_matrix = ();
2000 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2001 0         0 foreach my $inner_class_name (@{$self->{_class_names}}) {
  0         0  
2002 0         0 $confusion_matrix{$class_name}->{$inner_class_name} = 0;
2003             }
2004             }
2005 0         0 foreach my $fold_index (0..9) {
2006 0         0 print "\nStarting the iteration indexed $fold_index of the 10-fold cross-validation test\n";
2007 0         0 my @testing_samples = @all_sample_names[$fold_size * $fold_index .. $fold_size * ($fold_index+1) - 1];
2008 0         0 my @training_samples = (@all_sample_names[0 .. $fold_size * $fold_index-1],
2009             @all_sample_names[$fold_size * ($fold_index+1) .. (scalar keys %all_training_data) - 1]);
2010 0         0 my %testing_data = ();
2011 0         0 foreach my $x (@testing_samples) {
2012 0         0 $testing_data{$x} = $all_training_data{$x};
2013             }
2014 0         0 my %training_data = ();
2015 0         0 foreach my $x (@training_samples) {
2016 0         0 $training_data{$x} = $all_training_data{$x};
2017             }
2018 0         0 my $trainingDT = Algorithm::DecisionTree->new('evalmode');
2019 0         0 $trainingDT->{_training_data_hash} = \%training_data;
2020 0         0 $trainingDT->{_class_names} = $self->{_class_names};
2021 0         0 $trainingDT->{_feature_names} = $self->{_feature_names};
2022 0         0 $trainingDT->{_entropy_threshold} = $self->{_entropy_threshold};
2023 0         0 $trainingDT->{_max_depth_desired} = $self->{_max_depth_desired};
2024             $trainingDT->{_symbolic_to_numeric_cardinality_threshold} =
2025 0         0 $self->{_symbolic_to_numeric_cardinality_threshold};
2026 0         0 foreach my $sample_name (@training_samples) {
2027             $trainingDT->{_samples_class_label_hash}->{$sample_name} =
2028 0         0 $self->{_samples_class_label_hash}->{$sample_name};
2029             }
2030 0         0 foreach my $feature (keys %{$self->{_features_and_values_hash}}) {
  0         0  
2031 0         0 $trainingDT->{_features_and_values_hash}->{$feature} = ();
2032             }
2033 0         0 my $pattern = '(\S+)\s*=\s*(\S+)';
2034 0         0 foreach my $item (sort {Algorithm::DecisionTree::sample_index($a) <=>
  0         0  
2035             Algorithm::DecisionTree::sample_index($b)}
2036 0         0 keys %{$trainingDT->{_training_data_hash}}) {
2037 0         0 foreach my $feature_and_value (@{$trainingDT->{_training_data_hash}->{$item}}) {
  0         0  
2038 0         0 $feature_and_value =~ /$pattern/;
2039 0         0 my ($feature,$value) = ($1,$2);
2040 0 0       0 push @{$trainingDT->{_features_and_values_hash}->{$feature}}, $value if $value ne 'NA';
  0         0  
2041             }
2042             }
2043 0         0 foreach my $feature (keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2044 0         0 my %seen = ();
2045 0 0 0     0 my @unique_values_for_feature = grep {$_ if $_ ne 'NA' && !$seen{$_}++}
2046 0         0 @{$trainingDT->{_features_and_values_hash}->{$feature}};
  0         0  
2047 0 0       0 if (Algorithm::DecisionTree::contained_in($feature,
2048 0         0 keys %{$self->{_numeric_features_valuerange_hash}})) {
2049 0         0 @unique_values_for_feature = sort {$a <=> $b} @unique_values_for_feature;
  0         0  
2050             } else {
2051 0         0 @unique_values_for_feature = sort @unique_values_for_feature;
2052             }
2053 0         0 $trainingDT->{_features_and_unique_values_hash}->{$feature} = \@unique_values_for_feature;
2054             }
2055 0         0 foreach my $feature (keys %{$self->{_numeric_features_valuerange_hash}}) {
  0         0  
2056             my @minmaxvalues = Algorithm::DecisionTree::minmax(
2057 0         0 \@{$trainingDT->{_features_and_unique_values_hash}->{$feature}});
  0         0  
2058 0         0 $trainingDT->{_numeric_features_valuerange_hash}->{$feature} = \@minmaxvalues;
2059             }
2060 0 0       0 if ($evaldebug) {
2061 0         0 print "\n\nprinting samples in the testing set: @testing_samples\n";
2062 0         0 print "\n\nPrinting features and their values in the training set:\n";
2063 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_values_hash}}) {
  0         0  
2064 0         0 print "$item => @{$trainingDT->{_features_and_values_hash}->{$item}}\n";
  0         0  
2065             }
2066 0         0 print "\n\nPrinting unique values for features:\n";
2067 0         0 foreach my $item (sort keys %{$trainingDT->{_features_and_unique_values_hash}}) {
  0         0  
2068 0         0 print "$item => @{$trainingDT->{_features_and_unique_values_hash}->{$item}}\n";
  0         0  
2069             }
2070 0         0 print "\n\nPrinting unique value ranges for features:\n";
2071 0         0 foreach my $item (sort keys %{$trainingDT->{_numeric_features_valuerange_hash}}) {
  0         0  
2072 0         0 print "$item => @{$trainingDT->{_numeric_features_valuerange_hash}->{$item}}\n";
  0         0  
2073             }
2074             }
2075 0         0 foreach my $feature (keys %{$self->{_features_and_unique_values_hash}}) {
  0         0  
2076             $trainingDT->{_feature_values_how_many_uniques_hash}->{$feature} =
2077 0         0 scalar @{$trainingDT->{_features_and_unique_values_hash}->{$feature}};
  0         0  
2078             }
2079 0 0       0 $trainingDT->{_debug2} = 1 if $evaldebug;
2080 0         0 $trainingDT->calculate_first_order_probabilities();
2081 0         0 $trainingDT->calculate_class_priors();
2082 0         0 my $root_node = $trainingDT->construct_decision_tree_classifier();
2083 0 0       0 $root_node->display_decision_tree(" ") if $evaldebug;
2084 0         0 foreach my $test_sample_name (@testing_samples) {
2085 0         0 my @test_sample_data = @{$all_training_data{$test_sample_name}};
  0         0  
2086 0 0       0 print "original data in test sample: @test_sample_data\n" if $evaldebug;
2087 0 0 0     0 @test_sample_data = grep {$_ if $_ && $_ !~ /=NA$/} @test_sample_data;
  0         0  
2088 0 0       0 print "filtered data in test sample: @test_sample_data\n" if $evaldebug;
2089 0         0 my %classification = %{$trainingDT->classify($root_node, \@test_sample_data)};
  0         0  
2090 0         0 my @solution_path = @{$classification{'solution_path'}};
  0         0  
2091 0         0 delete $classification{'solution_path'};
2092 0         0 my @which_classes = keys %classification;
2093 0         0 @which_classes = sort {$classification{$b} <=> $classification{$a}} @which_classes;
  0         0  
2094 0         0 my $most_likely_class_label = $which_classes[0];
2095 0 0       0 if ($evaldebug) {
2096 0         0 print "\nClassification:\n\n";
2097 0         0 print " class probability\n";
2098 0         0 print " ---------- -----------\n";
2099 0         0 foreach my $which_class (@which_classes) {
2100 0         0 my $classstring = sprintf("%-30s", $which_class);
2101 0         0 my $valuestring = sprintf("%-30s", $classification{$which_class});
2102 0         0 print " $classstring $valuestring\n";
2103             }
2104 0         0 print "\nSolution path in the decision tree: @solution_path\n";
2105 0         0 print "\nNumber of nodes created: " . $root_node->how_many_nodes() . "\n";
2106             }
2107 0         0 my $true_class_label_for_sample = $self->{_samples_class_label_hash}->{$test_sample_name};
2108 0 0       0 print "$test_sample_name: true_class: $true_class_label_for_sample " .
2109             "estimated_class: $most_likely_class_label\n" if $evaldebug;
2110 0         0 $confusion_matrix{$true_class_label_for_sample}->{$most_likely_class_label} += 1;
2111             }
2112             }
2113 0         0 print "\n\n DISPLAYING THE CONFUSION MATRIX FOR THE 10-FOLD CROSS-VALIDATION TEST:\n\n\n";
2114 0         0 my $matrix_header = " " x 30;
2115 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2116 0         0 $matrix_header .= sprintf("%-30s", $class_name);
2117             }
2118 0         0 print "\n" . $matrix_header . "\n\n";
2119 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2120 0         0 my $row_display = sprintf("%-30s", $row_class_name);
2121 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2122 0         0 $row_display .= sprintf( "%-30u", $confusion_matrix{$row_class_name}->{$col_class_name} );
2123             }
2124 0         0 print "$row_display\n\n";
2125             }
2126 0         0 print "\n\n";
2127 0         0 my ($diagonal_sum, $off_diagonal_sum) = (0,0);
2128 0         0 foreach my $row_class_name (sort keys %confusion_matrix) {
2129 0         0 foreach my $col_class_name (sort keys %{$confusion_matrix{$row_class_name}}) {
  0         0  
2130 0 0       0 if ($row_class_name eq $col_class_name) {
2131 0         0 $diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2132             } else {
2133 0         0 $off_diagonal_sum += $confusion_matrix{$row_class_name}->{$col_class_name};
2134             }
2135             }
2136             }
2137 0         0 my $data_quality_index = 100.0 * $diagonal_sum / ($diagonal_sum + $off_diagonal_sum);
2138 0         0 print "\nTraining Data Quality Index: $data_quality_index (out of a possible maximum of 100)\n";
2139 0 0 0     0 if ($data_quality_index <= 80) {
    0 0        
    0 0        
    0          
2140 0         0 print "\nYour training data does not possess much class discriminatory " .
2141             "information. It could be that the classes are inherently not well " .
2142             "separable or that your constructor parameter choices are not appropriate.\n";
2143             } elsif ($data_quality_index > 80 && $data_quality_index <= 90) {
2144 0         0 print "\nYour training data possesses some class discriminatory information " .
2145             "but it may not be sufficient for real-world applications. You might " .
2146             "try tweaking the constructor parameters to see if that improves the " .
2147             "class discriminations.\n";
2148             } elsif ($data_quality_index > 90 && $data_quality_index <= 95) {
2149 0         0 print "\nYour training data appears to possess good class discriminatory " .
2150             "information. Whether or not it is acceptable would depend on your " .
2151             "application.\n";
2152             } elsif ($data_quality_index > 95 && $data_quality_index <= 98) {
2153 0         0 print "\nYour training data is of excellent quality.\n";
2154             } else {
2155 0         0 print "\nYour training data is perfect.\n";
2156             }
2157              
2158             }
2159              
2160              
2161             ############################################# Class DTNode #############################################
2162              
2163             # The nodes of the decision tree are instances of this class:
2164              
2165             package DTNode;
2166              
2167 1     1   16 use strict;
  1         2  
  1         25  
2168 1     1   4 use Carp;
  1         1  
  1         847  
2169              
2170             # $feature is the feature test at the current node. $branch_features_and_values_or_thresholds
2171             # is an anonymous array holding the feature names and corresponding values or thresholds
2172             # on the path from the root to the current node:
2173             sub new {
2174 26     26   26 my ($class, $feature, $entropy, $class_probabilities,
2175             $branch_features_and_values_or_thresholds, $dt, $root_or_not) = @_;
2176 26 100       46 $root_or_not = '' if !defined $root_or_not;
2177 26 100       39 if ($root_or_not eq 'root') {
2178 1         2 $dt->{nodes_created} = -1;
2179 1         4 $dt->{class_names} = undef;
2180             }
2181 26         78 my $self = {
2182             _dt => $dt,
2183             _feature => $feature,
2184             _node_creation_entropy => $entropy,
2185             _class_probabilities => $class_probabilities,
2186             _branch_features_and_values_or_thresholds => $branch_features_and_values_or_thresholds,
2187             _linked_to => [],
2188             };
2189 26         29 bless $self, $class;
2190 26         31 $self->{_serial_number} = $self->get_next_serial_num();
2191 26         29 return $self;
2192             }
2193              
2194             sub how_many_nodes {
2195 0     0   0 my $self = shift;
2196 0         0 return $self->{_dt}->{nodes_created} + 1;
2197             }
2198              
2199             sub set_class_names {
2200 1     1   1 my $self = shift;
2201 1         1 my $class_names_list = shift;
2202 1         2 $self->{_dt}->{class_names} = $class_names_list;
2203             }
2204              
2205             sub get_class_names {
2206 0     0   0 my $self = shift;
2207 0         0 return $self->{_dt}->{class_names};
2208             }
2209              
2210             sub get_next_serial_num {
2211 26     26   18 my $self = shift;
2212 26         31 $self->{_dt}->{nodes_created} += 1;
2213 26         35 return $self->{_dt}->{nodes_created};
2214             }
2215              
2216             sub get_serial_num {
2217 26     26   18 my $self = shift;
2218 26         24 $self->{_serial_number};
2219             }
2220              
2221             # this returns the feature test at the current node
2222             sub get_feature {
2223 0     0   0 my $self = shift;
2224 0         0 return $self->{ _feature };
2225             }
2226              
2227             sub set_feature {
2228 18     18   16 my $self = shift;
2229 18         16 my $feature = shift;
2230 18         20 $self->{_feature} = $feature;
2231             }
2232              
2233             sub get_node_entropy {
2234 26     26   21 my $self = shift;
2235 26         26 return $self->{_node_creation_entropy};
2236             }
2237              
2238             sub get_class_probabilities {
2239 0     0   0 my $self = shift;
2240 0         0 return $self->{ _class_probabilities};
2241             }
2242              
2243             sub get_branch_features_and_values_or_thresholds {
2244 26     26   19 my $self = shift;
2245 26         34 return $self->{_branch_features_and_values_or_thresholds};
2246             }
2247              
2248             sub add_to_branch_features_and_values {
2249 0     0   0 my $self = shift;
2250 0         0 my $feature_and_value = shift;
2251 0         0 push @{$self->{ _branch_features_and_values }}, $feature_and_value;
  0         0  
2252             }
2253              
2254             sub get_children {
2255 0     0   0 my $self = shift;
2256 0         0 return $self->{_linked_to};
2257             }
2258              
2259             sub add_child_link {
2260 25     25   22 my ($self, $new_node, ) = @_;
2261 25         16 push @{$self->{_linked_to}}, $new_node;
  25         36  
2262             }
2263              
2264             sub delete_all_links {
2265 0     0   0 my $self = shift;
2266 0         0 $self->{_linked_to} = undef;
2267             }
2268              
2269             sub display_node {
2270 0     0   0 my $self = shift;
2271 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2272 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2273 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2274 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2275 0         0 my @class_probabilities_for_display = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2276 0         0 my $serial_num = $self->get_serial_num();
2277 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2278 0         0 print "\n\nNODE $serial_num" .
2279             ":\n Branch features and values to this node: @branch_features_and_values_or_thresholds" .
2280             "\n Class probabilities at current node: @class_probabilities_for_display" .
2281             "\n Entropy at current node: $print_node_creation_entropy_at_node" .
2282             "\n Best feature test at current node: $feature_at_node\n\n";
2283             }
2284              
2285             sub display_decision_tree {
2286 0     0   0 my $self = shift;
2287 0         0 my $offset = shift;
2288 0         0 my $serial_num = $self->get_serial_num();
2289 0 0       0 if (@{$self->get_children()} > 0) {
  0         0  
2290 0   0     0 my $feature_at_node = $self->get_feature() || " ";
2291 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2292 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2293 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2294 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2295 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2296 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2297             my @print_class_probabilities_with_class =
2298 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2299 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
2300 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2301 0         0 print "$second_line_offset" . "Decision Feature: $feature_at_node Node Creation Entropy: " ,
2302             "$print_node_creation_entropy_at_node Class Probs: @print_class_probabilities_with_class\n\n";
2303 0         0 $offset .= " ";
2304 0         0 foreach my $child (@{$self->get_children()}) {
  0         0  
2305 0         0 $child->display_decision_tree($offset);
2306             }
2307             } else {
2308 0         0 my $node_creation_entropy_at_node = $self->get_node_entropy();
2309 0         0 my $print_node_creation_entropy_at_node = sprintf("%.3f", $node_creation_entropy_at_node);
2310 0         0 my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
  0         0  
2311 0         0 my @class_probabilities = @{$self->get_class_probabilities()};
  0         0  
2312 0         0 my @print_class_probabilities = map {sprintf("%0.3f", $_)} @class_probabilities;
  0         0  
2313 0         0 my @class_names = @{$self->get_class_names()};
  0         0  
2314             my @print_class_probabilities_with_class =
2315 0         0 map {"$class_names[$_]" . '=>' . $print_class_probabilities[$_]} 0..@class_names-1;
  0         0  
2316 0         0 print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
2317 0         0 my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
2318 0         0 print "$second_line_offset" . "Node Creation Entropy: $print_node_creation_entropy_at_node " .
2319             "Class Probs: @print_class_probabilities_with_class\n\n";
2320             }
2321             }
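## The calls below mirror how EvalTrainingData uses this class higher up in
## this file: construct the tree, display it with an indentation offset, and
## query the node count ($dt is assumed to be a fully initialized
## Algorithm::DecisionTree instance):

my $root_node = $dt->construct_decision_tree_classifier();
$root_node->display_decision_tree("   ");
print "Number of nodes created: ", $root_node->how_many_nodes(), "\n";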
2322              
2323              
2324             ############################## Generate Your Own Numeric Training Data #################################
2325             ############################# Class TrainingDataGeneratorNumeric ################################
2326              
2327             ## See the script generate_training_data_numeric.pl in the examples
2328             ## directory to see how to use this class for generating your own numeric training and
2329             ## test data. The training and test data are generated in accordance with the
2330             ## specifications you place in the parameter file that is supplied as an argument to
2331             ## the constructor of this class.
2332              
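## A hedged usage sketch for this class (the file names and sample counts are
## hypothetical; the two method calls are defined further below):

my $numeric_generator = TrainingDataGeneratorNumeric->new(
                            output_training_csv_file       => 'training_numeric.csv',
                            output_test_csv_file           => 'test_numeric.csv',
                            parameter_file                 => 'param_numeric.txt',
                            number_of_samples_for_training => 300,
                            number_of_samples_for_testing  => 100,
                        );
$numeric_generator->read_parameter_file_numeric();
$numeric_generator->gen_numeric_training_and_test_data_and_write_to_csv();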
2333             package TrainingDataGeneratorNumeric;
2334              
2335 1     1   5 use strict;
  1         1  
  1         16  
2336 1     1   3 use Carp;
  1         1  
  1         943  
2337              
2338             sub new {
2339 0     0   0 my ($class, %args) = @_;
2340 0         0 my @params = keys %args;
2341 0 0       0 croak "\nYou have used a wrong name for a keyword argument " .
2342             "--- perhaps a misspelling\n"
2343             if check_for_illegal_params3(@params) == 0;
2344             bless {
2345             _output_training_csv_file => $args{'output_training_csv_file'}
2346             || croak("name for output_training_csv_file required"),
2347             _output_test_csv_file => $args{'output_test_csv_file'}
2348             || croak("name for output_test_csv_file required"),
2349             _parameter_file => $args{'parameter_file'}
2350             || croak("parameter_file required"),
2351             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2352             || croak("number_of_samples_for_training"),
2353             _number_of_samples_for_testing => $args{'number_of_samples_for_testing'}
2354             || croak("number_of_samples_for_testing"),
2355 0   0     0 _debug => $args{debug} || 0,
      0        
      0        
      0        
      0        
      0        
2356             _class_names => [],
2357             _class_names_and_priors => {},
2358             _features_with_value_range => {},
2359             _features_ordered => [],
2360             _classes_and_their_param_values => {},
2361             }, $class;
2362             }
2363              
2364             sub check_for_illegal_params3 {
2365 0     0   0 my @params = @_;
2366 0         0 my @legal_params = qw / output_training_csv_file
2367             output_test_csv_file
2368             parameter_file
2369             number_of_samples_for_training
2370             number_of_samples_for_testing
2371             debug
2372             /;
2373 0         0 my $found_match_flag;
2374 0         0 foreach my $param (@params) {
2375 0         0 foreach my $legal (@legal_params) {
2376 0         0 $found_match_flag = 0;
2377 0 0       0 if ($param eq $legal) {
2378 0         0 $found_match_flag = 1;
2379 0         0 last;
2380             }
2381             }
2382 0 0       0 last if $found_match_flag == 0;
2383             }
2384 0         0 return $found_match_flag;
2385             }
2386              
2387             ## The training data generated by an instance of the class
2388             ## TrainingDataGeneratorNumeric is based on the specs you place in a parameter file
2389             ## that you supply to the class constructor through a constructor variable called
2390             ## `parameter_file'. This method parses the parameter file in order to determine
2391             ## the names to be used for the different data classes, their means, and their
2392             ## variances.
2393             sub read_parameter_file_numeric {
2394 0     0   0 my $self = shift;
2395 0         0 my @class_names = ();
2396 0         0 my %class_names_and_priors = ();
2397 0         0 my %features_with_value_range = ();
2398 0         0 my %classes_and_their_param_values = ();
2399             # my $regex8 = '[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?';
2400 0   0     0 open FILE, $self->{_parameter_file} || die "unable to open parameter file: $!";
2401 0         0 my @params = ;
2402 0         0 my $params = join "", @params;
2403 0         0 my $regex = 'class names: ([\w ]+)\W*class priors: ([\d. ]+)';
2404 0         0 $params =~ /$regex/si;
2405 0         0 my ($class_names, $class_priors) = ($1, $2);
2406 0         0 @class_names = split ' ', $class_names;
2407 0         0 my @class_priors = split ' ', $class_priors;
2408 0         0 foreach my $i (0..@class_names-1) {
2409 0         0 $class_names_and_priors{$class_names[$i]} = $class_priors[$i];
2410             }
2411 0 0       0 if ($self->{_debug}) {
2412 0         0 foreach my $cname (keys %class_names_and_priors) {
2413 0         0 print "$cname => $class_names_and_priors{$cname}\n";
2414             }
2415             }
2416 0         0 $regex = 'feature name: \w*.*?value range: [\d\. -]+';
2417 0         0 my @features = $params =~ /$regex/gsi;
2418 0         0 my @features_ordered;
2419 0         0 $regex = 'feature name: (\w+)\W*?value range:\s*([\d. -]+)';
2420 0         0 foreach my $feature (@features) {
2421 0         0 $feature =~ /$regex/i;
2422 0         0 my $feature_name = $1;
2423 0         0 push @features_ordered, $feature_name;
2424 0         0 my @value_range = split ' ', $2;
2425 0         0 $features_with_value_range{$feature_name} = \@value_range;
2426             }
2427 0 0       0 if ($self->{_debug}) {
2428 0         0 foreach my $fname (keys %features_with_value_range) {
2429 0         0 print "$fname => @{$features_with_value_range{$fname}}\n";
  0         0  
2430             }
2431             }
2432 0         0 foreach my $i (0..@class_names-1) {
2433 0         0 $classes_and_their_param_values{$class_names[$i]} = {};
2434             }
2435 0         0 $regex = 'params for class: \w*?\W+?mean:[\d\. ]+\W*?covariance:\W+?(?:[ \d.]+\W+?)+';
2436 0         0 my @class_params = $params =~ /$regex/gsi;
2437 0         0 $regex = 'params for class: (\w+)\W*?mean:\s*([\d. -]+)\W*covariance:\s*([\s\d.]+)';
2438 0         0 foreach my $class_param (@class_params) {
2439 0         0 $class_param =~ /$regex/gsi;
2440 0         0 my $class_name = $1;
2441 0         0 my @class_mean = split ' ', $2;
2442 0         0 $classes_and_their_param_values{$class_name}->{'mean'} = \@class_mean;
2443 0         0 my $class_param_string = $3;
2444 0         0 my @covar_rows = split '\n', $class_param_string;
2445 0         0 my @covar_matrix;
2446 0         0 foreach my $row (@covar_rows) {
2447 0         0 my @row = split ' ', $row;
2448 0         0 push @covar_matrix, \@row;
2449             }
2450 0         0 $classes_and_their_param_values{$class_name}->{'covariance'} = \@covar_matrix;
2451             }
2452 0 0       0 if ($self->{_debug}) {
2453 0         0 print "\nThe class parameters are:\n\n";
2454 0         0 foreach my $cname (keys %classes_and_their_param_values) {
2455 0         0 print "\nFor class name $cname:\n";
2456 0         0 my %params_hash = %{$classes_and_their_param_values{$cname}};
  0         0  
2457 0         0 foreach my $x (keys %params_hash) {
2458 0 0       0 if ($x eq 'mean') {
2459 0         0 print " $x => @{$params_hash{$x}}\n";
  0         0  
2460             } else {
2461 0 0       0 if ($x eq 'covariance') {
2462 0         0 print " The covariance matrix:\n";
2463 0         0 my @matrix = @{$params_hash{'covariance'}};
  0         0  
2464 0         0 foreach my $row (@matrix) {
2465 0         0 print " @$row\n";
2466             }
2467             }
2468             }
2469             }
2470             }
2471             }
2472 0         0 $self->{_class_names} = \@class_names;
2473 0         0 $self->{_class_names_and_priors} = \%class_names_and_priors;
2474 0         0 $self->{_features_with_value_range} = \%features_with_value_range;
2475 0         0 $self->{_classes_and_their_param_values} = \%classes_and_their_param_values;
2476 0         0 $self->{_features_ordered} = \@features_ordered;
2477             }
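## The regexes above expect the parameter file to be laid out roughly as in
## the sketch below (illustrative only; the class names, priors, means and
## covariances are made up):
##
##     class names: recession goodtimes
##     class priors: 0.4 0.6
##     feature name: gdp
##     value range: 0 100
##     feature name: return_on_investment
##     value range: 0 100
##     params for class: recession
##     mean: 50.0 30.0
##     covariance:
##          1.0 0.0
##          0.0 1.0
##     params for class: goodtimes
##     mean: 80.0 60.0
##     covariance:
##          1.0 0.0
##          0.0 1.0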
2478              
2479             ## After the parameter file is parsed by the previous method, this method calls on
2480             ## Math::Random::random_multivariate_normal() to generate the training and test data
2481             ## samples. Your training and test data can be of any number of dimensions and can
2482             ## have any mean and any covariance. The training and test data must obviously be
2483             ## drawn from the same distribution.
2484             sub gen_numeric_training_and_test_data_and_write_to_csv {
2485 1     1   584 use Math::Random;
  1         4461  
  1         767  
2486 0     0   0 my $self = shift;
2487 0         0 my %training_samples_for_class;
2488             my %test_samples_for_class;
2489 0         0 foreach my $class_name (@{$self->{_class_names}}) {
  0         0  
2490 0         0 $training_samples_for_class{$class_name} = [];
2491 0         0 $test_samples_for_class{$class_name} = [];
2492             }
2493 0         0 foreach my $class_name (keys %{$self->{_classes_and_their_param_values}}) {
  0         0  
2494 0         0 my @mean = @{$self->{_classes_and_their_param_values}->{$class_name}->{'mean'}};
  0         0  
2495 0         0 my @covariance = @{$self->{_classes_and_their_param_values}->{$class_name}->{'covariance'}};
  0         0  
2496             my @new_training_data = Math::Random::random_multivariate_normal(
2497 0         0 $self->{_number_of_samples_for_training} * $self->{_class_names_and_priors}->{$class_name},
2498             @mean, @covariance );
2499             my @new_test_data = Math::Random::random_multivariate_normal(
2500 0         0 $self->{_number_of_samples_for_testing} * $self->{_class_names_and_priors}->{$class_name},
2501             @mean, @covariance );
2502 0 0       0 if ($self->{_debug}) {
2503 0         0 print "training data for class $class_name:\n";
2504 0         0 foreach my $x (@new_training_data) {print "@$x\n";}
  0         0  
2505 0         0 print "\n\ntest data for class $class_name:\n";
2506 0         0 foreach my $x (@new_test_data) {print "@$x\n";}
  0         0  
2507             }
2508 0         0 $training_samples_for_class{$class_name} = \@new_training_data;
2509 0         0 $test_samples_for_class{$class_name} = \@new_test_data;
2510             }
2511 0         0 my @training_data_records = ();
2512 0         0 my @test_data_records = ();
2513 0         0 foreach my $class_name (keys %training_samples_for_class) {
2514             my $num_of_samples_for_training = $self->{_number_of_samples_for_training} *
2515 0         0 $self->{_class_names_and_priors}->{$class_name};
2516             my $num_of_samples_for_testing = $self->{_number_of_samples_for_testing} *
2517 0         0 $self->{_class_names_and_priors}->{$class_name};
2518 0         0 foreach my $sample_index (0..$num_of_samples_for_training-1) {
2519 0         0 my @training_vector = @{$training_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2520 0         0 @training_vector = map {sprintf("%.3f", $_)} @training_vector;
  0         0  
2521 0         0 my $training_data_record = "$class_name," . join(",", @training_vector) . "\n";
2522 0         0 push @training_data_records, $training_data_record;
2523             }
2524 0         0 foreach my $sample_index (0..$num_of_samples_for_testing-1) {
2525 0         0 my @test_vector = @{$test_samples_for_class{$class_name}->[$sample_index]};
  0         0  
2526 0         0 @test_vector = map {sprintf("%.3f", $_)} @test_vector;
  0         0  
2527 0         0 my $test_data_record = "$class_name," . join(",", @test_vector) . "\n";
2528 0         0 push @test_data_records, $test_data_record;
2529             }
2530             }
2531 0         0 fisher_yates_shuffle(\@training_data_records);
2532 0         0 fisher_yates_shuffle(\@test_data_records);
2533 0 0       0 if ($self->{_debug}) {
2534 0         0 foreach my $record (@training_data_records) {
2535 0         0 print "$record";
2536             }
2537 0         0 foreach my $record (@test_data_records) {
2538 0         0 print "$record";
2539             }
2540             }
2541 0         0 open OUTPUT, ">$self->{_output_training_csv_file}";
2542 0         0 my @feature_names_training = @{$self->{_features_ordered}};
  0         0  
2543 0         0 my @quoted_feature_names_training = map {"\"$_\""} @feature_names_training;
  0         0  
2544 0         0 my $first_row_training = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_training;
2545 0         0 print OUTPUT "$first_row_training\n";
2546 0         0 foreach my $i (0..@training_data_records-1) {
2547 0         0 my $i1 = $i+1;
2548 0         0 my $sample_record = "\"$i1\",$training_data_records[$i]";
2549 0         0 print OUTPUT "$sample_record";
2550             }
2551 0         0 close OUTPUT;
2552 0         0 open OUTPUT, ">$self->{_output_test_csv_file}";
2553 0         0 my @feature_names_testing = keys %{$self->{_features_with_value_range}};
  0         0  
2554 0         0 my @quoted_feature_names_testing = map {"\"$_\""} @feature_names_testing;
  0         0  
2555 0         0 my $first_row_testing = '"",' . "\"class_name\"," . join ",", @quoted_feature_names_testing;
2556 0         0 print OUTPUT "$first_row_testing\n";
2557 0         0 foreach my $i (0..@test_data_records-1) {
2558 0         0 my $i1 = $i+1;
2559 0         0 my $sample_record = "\"$i1\",$test_data_records[$i]";
2560 0         0 print OUTPUT "$sample_record";
2561             }
2562 0         0 close OUTPUT;
2563             }
2564              
2565             # from perl docs:
2566             sub fisher_yates_shuffle {
2567 0     0   0 my $arr = shift;
2568 0         0 my $i = @$arr;
2569 0         0 while (--$i) {
2570 0         0 my $j = int rand( $i + 1 );
2571 0         0 @$arr[$i, $j] = @$arr[$j, $i];
2572             }
2573             }
2574              
2575             ########################### Generate Your Own Symbolic Training Data ###############################
2576             ########################### Class TrainingDataGeneratorSymbolic #############################
2577              
2578             ## See the sample script generate_training_and_test_data_symbolic.pl for how to use
2579             ## this class for generating purely symbolic training and test data. The data is
2580             ## generated according to the specifications you place in a parameter file whose
2581             ## name you supply as one of the constructor arguments.
2582             package TrainingDataGeneratorSymbolic;
2583              
2584 1     1   6 use strict;
  1         1  
  1         21  
2585 1     1   3 use Carp;
  1         2  
  1         1734  
2586              
2587             sub new {
2588 1     1   14 my ($class, %args) = @_;
2589 1         4 my @params = keys %args;
2590 1 50       3 croak "\nYou have used a wrong name for a keyword argument " .
2591             "--- perhaps a misspelling\n"
2592             if check_for_illegal_params4(@params) == 0;
2593             bless {
2594             _output_training_datafile => $args{'output_training_datafile'}
2595             || die("name for output_training_datafile required"),
2596             _parameter_file => $args{'parameter_file'}
2597             || die("parameter_file required"),
2598             _number_of_samples_for_training => $args{'number_of_samples_for_training'}
2599             || die("number_of_samples_for_training required"),
2600 1   50     17 _debug => $args{debug} || 0,
      50        
      50        
      50        
2601             _class_names => [],
2602             _class_priors => [],
2603             _features_and_values_hash => {},
2604             _bias_hash => {},
2605             _training_sample_records => {},
2606             }, $class;
2607             }
2608              
2609             sub check_for_illegal_params4 {
2610 1     1   2 my @params = @_;
2611 1         2 my @legal_params = qw / output_training_datafile
2612             parameter_file
2613             number_of_samples_for_training
2614             debug
2615             /;
2616 1         1 my $found_match_flag;
2617 1         2 foreach my $param (@params) {
2618 3         3 foreach my $legal (@legal_params) {
2619 6         3 $found_match_flag = 0;
2620 6 100       10 if ($param eq $legal) {
2621 3         2 $found_match_flag = 1;
2622 3         3 last;
2623             }
2624             }
2625 3 50       6 last if $found_match_flag == 0;
2626             }
2627 1         3 return $found_match_flag;
2628             }
2629              
2630             ## Read a parameter file for generating symbolic training data. See the script
2631             ## generate_symbolic_training_data_symbolic.pl in the Examples directory for how to
2632             ## pass the name of the parameter file to the constructor of the
2633             ## TrainingDataGeneratorSymbolic class.
2634             sub read_parameter_file_symbolic {
2635 1     1   4 my $self = shift;
2636 1         5 my $debug = $self->{_debug};
2637 1         1 my $number_of_training_samples = $self->{_number_of_samples_for_training};
2638 1         2 my $input_parameter_file = $self->{_parameter_file};
2639 1 50       3 croak "Forgot to supply parameter file" if ! defined $input_parameter_file;
2640 1         2 my $output_file_training = $self->{_output_training_datafile};
2641 1         1 my $output_file_testing = $self->{_output_test_datafile};
2642 1         1 my @all_params;
2643             my $param_string;
2644 1   33     24 open INPUT, $input_parameter_file or die "unable to open parameter file: $!";
2645 1         30 @all_params = <INPUT>;
2646 1         3 @all_params = grep { $_ !~ /^[ ]*#/ } @all_params;
  40         46  
2647 1         3 @all_params = grep { $_ =~ s/\r?\n?$//} @all_params;
  36         58  
2648 1         5 $param_string = join ' ', @all_params;
2649 1         7 my ($class_names, $class_priors, $rest_param) =
2650             $param_string =~ /^\s*class names:(.*?)\s*class priors:(.*?)(feature: .*)/;
2651 1 50       40 my @class_names = grep {defined($_) && length($_) > 0} split /\s+/, $1;
  3         14  
2652 1         1 push @{$self->{_class_names}}, @class_names;
  1         2  
2653 1 50       5 my @class_priors = grep {defined($_) && length($_) > 0} split /\s+/, $2;
  3         10  
2654 1         1 push @{$self->{_class_priors}}, @class_priors;
  1         3  
2655 1         8 my ($feature_string, $bias_string) = $rest_param =~ /(feature:.*?) (bias:.*)/;
2656 1         1 my %features_and_values_hash;
2657 1         6 my @features = split /(feature[:])/, $feature_string;
2658 1 50       5 @features = grep {defined($_) && length($_) > 0} @features;
  9         23  
2659 1         2 foreach my $item (@features) {
2660 8 100       12 next if $item =~ /feature/;
2661 4         13 my @splits = split / /, $item;
2662 4 50       4 @splits = grep {defined($_) && length($_) > 0} @splits;
  27         61  
2663 4         8 foreach my $i (0..@splits-1) {
2664 22 100       24 if ($i == 0) {
2665 4         7 $features_and_values_hash{$splits[0]} = [];
2666             } else {
2667 18 100       25 next if $splits[$i] =~ /values/;
2668 14         9 push @{$features_and_values_hash{$splits[0]}}, $splits[$i];
  14         21  
2669             }
2670             }
2671             }
2672 1         2 $self->{_features_and_values_hash} = \%features_and_values_hash;
2673 1         1 my %bias_hash = %{$self->{_bias_hash}};
  1         3  
2674 1         7 my @biases = split /(bias[:]\s*class[:])/, $bias_string;
2675 1 50       1 @biases = grep {defined($_) && length($_) > 0} @biases;
  5         13  
2676 1         2 foreach my $item (@biases) {
2677 4 100       8 next if $item =~ /bias/;
2678 2         16 my @splits = split /\s+/, $item;
2679 2 50       4 @splits = grep {defined($_) && length($_) > 0} @splits;
  18         43  
2680 2         1 my $feature_name;
2681 2         4 foreach my $i (0..@splits-1) {
2682 16 100       29 if ($i == 0) {
    100          
2683 2         3 $bias_hash{$splits[0]} = {};
2684             } elsif ($splits[$i] =~ /(^.+)[:]$/) {
2685 8         9 $feature_name = $1;
2686 8         14 $bias_hash{$splits[0]}->{$feature_name} = [];
2687             } else {
2688 6 50       9 next if !defined $feature_name;
2689 6 50       8 push @{$bias_hash{$splits[0]}->{$feature_name}}, $splits[$i]
  6         11  
2690             if defined $feature_name;
2691             }
2692             }
2693             }
2694 1         1 $self->{_bias_hash} = \%bias_hash;
2695 1 50       10 if ($debug) {
2696 0         0 print "\n\nClass names: @class_names\n";
2697 0         0 my $num_of_classes = @class_names;
2698 0         0 print "Class priors: @class_priors\n";
2699 0         0 print "Number of classes: $num_of_classes\n";
2700 0         0 print "\nHere are the features and their possible values:\n";
2701 0         0 while ( my ($k, $v) = each %features_and_values_hash ) {
2702 0         0 print "$k ===> @$v\n";
2703             }
2704 0         0 print "\nHere is the biasing for each class:\n";
2705 0         0 while ( my ($k, $v) = each %bias_hash ) {
2706 0         0 print "$k:\n";
2707 0         0 while ( my ($k1, $v1) = each %$v ) {
2708 0         0 print " $k1 ===> @$v1\n";
2709             }
2710             }
2711             }
2712             }
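## The parsing above expects the parameter file to look roughly like the
## sketch below (illustrative only; the class names, features, values and
## biases are made up):
##
##     class names: malignant benign
##     class priors: 0.4 0.6
##     feature: smoking
##     values: heavy medium light never
##     feature: exercising
##     values: never occasionally regularly
##     bias: class: malignant
##           smoking:    heavy=0.7
##           exercising: never=0.7
##     bias: class: benign
##           smoking:    never=0.7
##           exercising: regularly=0.7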
2713              
2714             ## This method generates training data according to the specifications placed in a
2715             ## parameter file that is read by the previous method.
2716             sub gen_symbolic_training_data {
2717 1     1   5 my $self = shift;
2718 1         1 my @class_names = @{$self->{_class_names}};
  1         3  
2719 1         1 my @class_priors = @{$self->{_class_priors}};
  1         3  
2720 1         1 my %training_sample_records;
2721 1         1 my %features_and_values_hash = %{$self->{_features_and_values_hash}};
  1         4  
2722 1         1 my %bias_hash = %{$self->{_bias_hash}};
  1         3  
2723 1         1 my $how_many_training_samples = $self->{_number_of_samples_for_training};
2724 1         1 my $how_many_test_samples = $self->{_number_of_samples_for_testing};
2725 1         1 my %class_priors_to_unit_interval_map;
2726 1         2 my $accumulated_interval = 0;
2727 1         2 foreach my $i (0..@class_names-1) {
2728 2         5 $class_priors_to_unit_interval_map{$class_names[$i]}
2729             = [$accumulated_interval, $accumulated_interval + $class_priors[$i]];
2730 2         4 $accumulated_interval += $class_priors[$i];
2731             }
2732 1 50       2 if ($self->{_debug}) {
2733 0         0 print "Mapping of class priors to unit interval: \n";
2734 0         0 while ( my ($k, $v) = each %class_priors_to_unit_interval_map ) {
2735 0         0 print "$k => @$v\n";
2736             }
2737 0         0 print "\n\n";
2738             }
2739 1         2 my $ele_index = 0;
2740 1         2 while ($ele_index < $how_many_training_samples) {
2741 35         33 my $sample_name = "sample" . "_$ele_index";
2742 35         50 $training_sample_records{$sample_name} = [];
2743             # Generate class label for this training sample:
2744 35         49 my $roll_the_dice = rand(1.0);
2745 35         33 my $class_label;
2746 35         39 foreach my $class_name (keys %class_priors_to_unit_interval_map ) {
2747 46         30 my $v = $class_priors_to_unit_interval_map{$class_name};
2748 46 100 66     121 if ( ($roll_the_dice >= $v->[0]) && ($roll_the_dice <= $v->[1]) ) {
2749 35         21 push @{$training_sample_records{$sample_name}},
  35         51  
2750             "class=" . $class_name;
2751 35         25 $class_label = $class_name;
2752 35         36 last;
2753             }
2754             }
2755 35         45 foreach my $feature (keys %features_and_values_hash) {
2756 140         88 my @values = @{$features_and_values_hash{$feature}};
  140         212  
2757 140         123 my $bias_string = $bias_hash{$class_label}->{$feature}->[0];
2758 140         111 my $no_bias = 1.0 / @values;
2759 140 100       271 $bias_string = "$values[0]" . "=$no_bias" if !defined $bias_string;
2760 140         68 my %value_priors_to_unit_interval_map;
2761 140         352 my @splits = split /\s*=\s*/, $bias_string;
2762 140         102 my $chosen_for_bias_value = $splits[0];
2763 140         91 my $chosen_bias = $splits[1];
2764 140         151 my $remaining_bias = 1 - $chosen_bias;
2765 140         123 my $remaining_portion_bias = $remaining_bias / (@values -1);
2766 140 50       111 @splits = grep {defined($_) && length($_) > 0} @splits;
  280         786  
2767 140         105 my $accumulated = 0;
2768 140         149 foreach my $i (0..@values-1) {
2769 490 100       481 if ($values[$i] eq $chosen_for_bias_value) {
2770 140         197 $value_priors_to_unit_interval_map{$values[$i]}
2771             = [$accumulated, $accumulated + $chosen_bias];
2772 140         124 $accumulated += $chosen_bias;
2773             } else {
2774 350         411 $value_priors_to_unit_interval_map{$values[$i]}
2775             = [$accumulated, $accumulated + $remaining_portion_bias];
2776 350         283 $accumulated += $remaining_portion_bias;
2777             }
2778             }
2779 140         108 my $roll_the_dice = rand(1.0);
2780 140         80 my $value_label;
2781 140         173 foreach my $value_name (keys %value_priors_to_unit_interval_map ) {
2782 312         222 my $v = $value_priors_to_unit_interval_map{$value_name};
2783 312 100 100     749 if ( ($roll_the_dice >= $v->[0])
2784             && ($roll_the_dice <= $v->[1]) ) {
2785 140         79 push @{$training_sample_records{$sample_name}},
  140         268  
2786             $feature . "=" . $value_name;
2787 140         91 $value_label = $value_name;
2788 140         102 last;
2789             }
2790             }
2791 140 50       401 if ($self->{_debug}) {
2792 0         0 print "mapping feature value priors for '$feature' " .
2793             "to unit interval: \n";
2794 0         0 while ( my ($k, $v) =
2795             each %value_priors_to_unit_interval_map ) {
2796 0         0 print "$k => @$v\n";
2797             }
2798 0         0 print "\n\n";
2799             }
2800             }
2801 35         61 $ele_index++;
2802             }
2803 1         2 $self->{_training_sample_records} = \%training_sample_records;
2804 1 50       3 if ($self->{_debug}) {
2805 0         0 print "\n\nPRINTING TRAINING RECORDS:\n\n";
2806 0         0 foreach my $kee (sort {sample_index($a) <=> sample_index($b)} keys %training_sample_records) {
  0         0  
2807 0         0 print "$kee => @{$training_sample_records{$kee}}\n\n";
  0         0  
2808             }
2809             }
2810 1         1 my $output_training_file = $self->{_output_training_datafile};
2811 1 50       3 print "\n\nDISPLAYING TRAINING RECORDS:\n\n" if $self->{_debug};
2812 1         77 open FILEHANDLE, ">$output_training_file";
2813 1         7 my @features = sort keys %features_and_values_hash;
2814 1         1 my $title_string = ',class';
2815 1         2 foreach my $feature_name (@features) {
2816 4         5 $title_string .= ',' . $feature_name;
2817             }
2818 1         11 print FILEHANDLE "$title_string\n";
2819 1         11 my @sample_names = sort {$a <=> $b} map { $_ =~ s/^sample_//; $_ } sort keys %training_sample_records;
  105         91  
  35         38  
  35         36  
2820 1         5 my $record_string = '';
2821 1         2 foreach my $sample_name (@sample_names) {
2822 35         27 $record_string .= "$sample_name,";
2823 35         19 my @record = @{$training_sample_records{"sample_$sample_name"}};
  35         55  
2824 35         45 my %item_parts_hash;
2825 35         26 foreach my $item (@record) {
2826 175         220 my @splits = grep $_, split /=/, $item;
2827 175         196 $item_parts_hash{$splits[0]} = $splits[1];
2828             }
2829 35         25 $record_string .= $item_parts_hash{"class"};
2830 35         29 delete $item_parts_hash{"class"};
2831 35         60 my @kees = sort keys %item_parts_hash;
2832 35         30 foreach my $kee (@kees) {
2833 140         115 $record_string .= ",$item_parts_hash{$kee}";
2834             }
2835 35         33 print FILEHANDLE "$record_string\n";
2836 35         47 $record_string = '';
2837             }
2838 1         36 close FILEHANDLE;
2839             }
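## With a parameter file along the lines of the sketch shown after
## read_parameter_file_symbolic() above, the generated training file would
## start out roughly like this (the rows are random draws, so the values
## below are only illustrative):
##
##     ,class,exercising,smoking
##     0,benign,regularly,never
##     1,malignant,never,heavy
##     2,benign,occasionally,light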
2840              
2841             sub sample_index {
2842 0     0     my $arg = shift;
2843 0           $arg =~ /_(.+)$/;
2844 0           return $1;
2845             }
2846              
2847             ################################# Decision Tree Introspection #######################################
2848             ################################# Class DTIntrospection #######################################
2849              
2850             package DTIntrospection;
2851              
2852             ## Instances constructed from this class can provide explanations for the
2853             ## classification decisions at the nodes of a decision tree.
2854             ##
2855             ## When used in the interactive mode, the decision-tree introspection made possible
2856             ## by this class provides answers to the following three questions: (1) List of the
2857             ## training samples that fall in the portion of the feature space that corresponds
2858             ## to a node of the decision tree; (2) The probabilities associated with the last
2859             ## feature test that led to the node; and (3) The class probabilities predicated on
2860             ## just the last feature test on the path to that node.
2861             ##
2862             ## CAVEAT: It is possible for a node to exist even when there are no training
2863             ## samples in the portion of the feature space that corresponds to the node. That
2864             ## is because a decision tree is based on the probability densities estimated from
2865             ## the training data. When training data is non-uniformly distributed, it is
2866             ## possible for the probability associated with a point in the feature space to be
2867             ## non-zero even when there are no training samples at or in the vicinity of that
2868             ## point.
2869             ##
2870             ## For a node to exist even where there are no training samples in the portion of
2871             ## the feature space that belongs to the node is an indication of the generalization
2872             ## ability of decision-tree based classification.
2873             ##
2874             ## When used in a non-interactive mode, an instance of this class can be used to
2875             ## create a tabular display that shows what training samples belong directly to the
2876             ## portion of the feature space that corresponds to each node of the decision tree.
2877             ## An instance of this class can also construct a tabular display that shows how the
2878             ## influence of each training sample propagates in the decision tree. For each
2879             ## training sample, this display first shows the list of nodes that came into
2880             ## existence through feature test(s) that used the data provided by that sample.
2881             ## This list for each training sample is followed by a subtree of the nodes that owe
2882             ## their existence indirectly to the training sample. A training sample influences a
2883             ## node indirectly if the node is a descendant of another node that is affected
2884             ## directly by the training sample.
2885              
2886 1     1   5 use strict;
  1         1  
  1         17  
2887 1     1   4 use Carp;
  1         0  
  1         1992  
2888              
2889             sub new {
2890 0     0     my ($class, $dt) = @_;
2891 0 0         croak "The argument supplied to the DTIntrospection constructor must be of type DecisionTree"
2892             unless ref($dt) eq "Algorithm::DecisionTree";
2893             bless {
2894             _dt => $dt,
2895             _root_dtnode => $dt->{_root_node},
2896 0           _samples_at_nodes_hash => {},
2897             _branch_features_to_nodes_hash => {},
2898             _sample_to_node_mapping_direct_hash => {},
2899             _node_serial_num_to_node_hash => {},
2900             _awareness_raising_msg_shown => 0,
2901             _debug => 0,
2902             }, $class;
2903             }
2904              
2905             sub initialize {
2906 0     0     my $self = shift;
2907             croak "You must first construct the decision tree before using the DTIntrospection class"
2908 0 0         unless $self->{_root_dtnode};
2909 0           $self->recursive_descent($self->{_root_dtnode});
2910             }
2911              
2912             sub recursive_descent {
2913 0     0     my $self = shift;
2914 0           my $node = shift;
2915 0           my $node_serial_number = $node->get_serial_num();
2916 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2917 0 0         print "\nAt node $node_serial_number: the branch features and values are: @{$branch_features_and_values_or_thresholds}\n" if $self->{_debug};
  0            
2918 0           $self->{_node_serial_num_to_node_hash}->{$node_serial_number} = $node;
2919 0           $self->{_branch_features_to_nodes_hash}->{$node_serial_number} = $branch_features_and_values_or_thresholds;
2920 0           my @samples_at_node = ();
2921 0           foreach my $item (@$branch_features_and_values_or_thresholds) {
2922 0           my $samples_for_feature_value_combo = $self->get_samples_for_feature_value_combo($item);
2923 0 0         unless (@samples_at_node) {
2924 0           @samples_at_node = @$samples_for_feature_value_combo;
2925             } else {
2926 0           my @accum;
2927 0           foreach my $sample (@samples_at_node) {
2928 0 0         push @accum, $sample if Algorithm::DecisionTree::contained_in($sample, @$samples_for_feature_value_combo);
2929             }
2930 0           @samples_at_node = @accum;
2931             }
2932 0 0         last unless @samples_at_node;
2933             }
2934 0           @samples_at_node = sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} @samples_at_node;
  0            
2935 0 0         print "Node: $node_serial_number the samples are: [@samples_at_node]\n" if ($self->{_debug});
2936 0           $self->{_samples_at_nodes_hash}->{$node_serial_number} = \@samples_at_node;
2937 0 0         if (@samples_at_node) {
2938 0           foreach my $sample (@samples_at_node) {
2939 0 0         if (! exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2940 0           $self->{_sample_to_node_mapping_direct_hash}->{$sample} = [$node_serial_number];
2941             } else {
2942 0           push @{$self->{_sample_to_node_mapping_direct_hash}->{$sample}}, $node_serial_number;
  0            
2943             }
2944             }
2945             }
2946 0           my $children = $node->get_children();
2947 0           foreach my $child (@$children) {
2948 0           $self->recursive_descent($child);
2949             }
2950             }
2951              
2952             sub display_training_samples_at_all_nodes_direct_influence_only {
2953 0     0     my $self = shift;
2954             croak "You must first construct the decision tree before using the DT Introspection class."
2955 0 0         unless $self->{_root_dtnode};
2956 0           $self->recursive_descent_for_showing_samples_at_a_node($self->{_root_dtnode});
2957             }
2958              
2959             sub recursive_descent_for_showing_samples_at_a_node{
2960 0     0     my $self = shift;
2961 0           my $node = shift;
2962 0           my $node_serial_number = $node->get_serial_num();
2963 0           my $branch_features_and_values_or_thresholds = $node->get_branch_features_and_values_or_thresholds();
2964 0 0         if (exists $self->{_samples_at_nodes_hash}->{$node_serial_number}) {
2965 0 0         print "\nAt node $node_serial_number: the branch features and values are: [@{$branch_features_and_values_or_thresholds}]\n" if $self->{_debug};
  0            
2966 0           print "Node $node_serial_number: the samples are: [@{$self->{_samples_at_nodes_hash}->{$node_serial_number}}]\n";
  0            
2967             }
2968 0           map $self->recursive_descent_for_showing_samples_at_a_node($_), @{$node->get_children()};
  0            
2969             }
2970              
2971             sub display_training_samples_to_nodes_influence_propagation {
2972 0     0     my $self = shift;
2973 0           foreach my $sample (sort {Algorithm::DecisionTree::sample_index($a) <=> Algorithm::DecisionTree::sample_index($b)} keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
  0            
2974 0 0         if (exists $self->{_sample_to_node_mapping_direct_hash}->{$sample}) {
2975 0           my $nodes_directly_affected = $self->{_sample_to_node_mapping_direct_hash}->{$sample};
2976 0           print "\n$sample:\n nodes affected directly: [@{$nodes_directly_affected}]\n";
  0            
2977 0           print " nodes affected through probabilistic generalization:\n";
2978 0           map $self->recursive_descent_for_sample_to_node_influence($_, $nodes_directly_affected, " "), @$nodes_directly_affected;
2979             }
2980             }
2981             }
2982              
2983             sub recursive_descent_for_sample_to_node_influence {
2984 0     0     my $self = shift;
2985 0           my $node_serial_num = shift;
2986 0           my $nodes_already_accounted_for = shift;
2987 0           my $offset = shift;
2988 0           $offset .= " ";
2989 0           my $node = $self->{_node_serial_num_to_node_hash}->{$node_serial_num};
2990 0           my @children = map $_->get_serial_num(), @{$node->get_children()};
  0            
2991 0           my @children_affected = grep {!Algorithm::DecisionTree::contained_in($_, @{$nodes_already_accounted_for})} @children;
  0            
  0            
2992 0 0         if (@children_affected) {
2993 0           print "$offset $node_serial_num => [@children_affected]\n";
2994             }
2995 0           map $self->recursive_descent_for_sample_to_node_influence($_, \@children_affected, $offset), @children_affected;
2996             }
2997              
2998             sub get_samples_for_feature_value_combo {
2999 0     0     my $self = shift;
3000 0           my $feature_value_combo = shift;
3001 0           my ($feature,$op,$value) = $self->extract_feature_op_val($feature_value_combo);
3002 0           my @samples = ();
3003 0 0         if ($op eq '=') {
    0          
    0          
3004 0           @samples = grep Algorithm::DecisionTree::contained_in($feature_value_combo, @{$self->{_dt}->{_training_data_hash}->{$_}}), keys %{$self->{_dt}->{_training_data_hash}};
  0            
  0            
3005             } elsif ($op eq '<') {
3006 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3007 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3008 0           foreach my $item (@features_and_values) {
3009 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3010 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data <= $value)) {
      0        
3011 0           push @samples, $sample;
3012 0           last;
3013             }
3014             }
3015             }
3016             } elsif ($op eq '>') {
3017 0           foreach my $sample (keys %{$self->{_dt}->{_training_data_hash}}) {
  0            
3018 0           my @features_and_values = @{$self->{_dt}->{_training_data_hash}->{$sample}};
  0            
3019 0           foreach my $item (@features_and_values) {
3020 0           my ($feature_data,$op_data,$val_data) = $self->extract_feature_op_val($item);
3021 0 0 0       if (($val_data ne 'NA') && ($feature eq $feature_data) && ($val_data > $value)) {
      0        
3022 0           push @samples, $sample;
3023 0           last;
3024             }
3025             }
3026             }
3027             } else {
3028 0           die "Something strange is going on";
3029             }
3030 0           return \@samples;
3031             }
3032              
3033             sub extract_feature_op_val {
3034 0     0     my $self = shift;
3035 0           my $feature_value_combo = shift;
3036 0           my $pattern1 = '(.+)=(.+)';
3037 0           my $pattern2 = '(.+)<(.+)';
3038 0           my $pattern3 = '(.+)>(.+)';
3039 0           my ($feature,$value,$op);
3040 0 0         if ($feature_value_combo =~ /$pattern2/) {
    0          
    0          
3041 0           ($feature,$op,$value) = ($1,'<',$2);
3042             } elsif ($feature_value_combo =~ /$pattern3/) {
3043 0           ($feature,$op,$value) = ($1,'>',$2);
3044             } elsif ($feature_value_combo =~ /$pattern1/) {
3045 0           ($feature,$op,$value) = ($1,'=',$2);
3046             }
3047 0           return ($feature,$op,$value);
3048             }
3049              
3050             sub explain_classifications_at_multiple_nodes_interactively {
3051 0     0     my $self = shift;
3052             croak "You called explain_classifications_at_multiple_nodes_interactively() without " .
3053             "first initializing the DTIntrospection instance in your code. Aborting."
3054 0 0         unless $self->{_samples_at_nodes_hash};
3055 0           print "\n\nIn order for the decision tree to introspect\n\n";
3056 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3057             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3058             " with it even when there are NO training data points in that region?\n" .
3059             "\nEnter 'y' for yes or any other character for no: ";
3060 0           my $ans = <STDIN>;
3061 0           $ans =~ s/^\s*|\s*$//g;
3062 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decisions in the decision tree. Aborting!\n" if $ans !~ /^ye?s?$/;
3063 0           $self->{_awareness_raising_msg_shown} = 1;
3064 0           while (1) {
3065 0           my $node_id;
3066             my $ans;
3067 0           while (1) {
3068 0           print "\nEnter the integer ID of a node: ";
3069 0           $ans = <STDIN>;
3070 0           $ans =~ s/^\s*|\s*$//g;
3071 0 0         return if $ans =~ /^exit$/;
3072 0 0         last if Algorithm::DecisionTree::contained_in($ans, keys %{$self->{_samples_at_nodes_hash}});
  0            
3073 0           print "\nYour answer must be an integer ID of a node. Try again or enter 'exit'.\n";
3074             }
3075 0           $node_id = $ans;
3076 0           $self->explain_classification_at_one_node($node_id)
3077             }
3078             }
3079              
3080             sub explain_classification_at_one_node {
3081 0     0     my $self = shift;
3082 0           my $node_id = shift;
3083             croak "You called explain_classification_at_one_node() without first initializing " .
3084 0 0         "the DTIntrospection instance in your code. Aborting." unless $self->{_samples_at_nodes_hash};
3085 0 0         unless (exists $self->{_samples_at_nodes_hash}->{$node_id}) {
3086 0           print "Node $node_id is not a node in the tree\n";
3087 0           return;
3088             }
3089 0 0         unless ($self->{_awareness_raising_msg_shown}) {
3090 0           print "\n\nIn order for the decision tree to introspect at Node $node_id: \n\n";
3091 0           print " DO YOU ACCEPT the fact that, in general, a region of the feature space\n" .
3092             " that corresponds to a DT node may have NON-ZERO probabilities associated\n" .
3093             " with it even when there are NO training data points in that region?\n" .
3094             "\nEnter 'y' for yes or any other character for no: ";
3095 0           my $ans = <STDIN>;
3096 0           $ans =~ s/^\s*|\s*$//g;
3097 0 0         die "\n Since you answered 'no' to a very real theoretical possibility, no explanations possible for the classification decision at node $node_id\n" if $ans !~ /^ye?s?$/;
3098             }
3099 0           my @samples_at_node = @{$self->{_samples_at_nodes_hash}->{$node_id}};
  0            
3100 0           my @branch_features_to_node = @{$self->{_branch_features_to_nodes_hash}->{$node_id}};
  0            
3101             # my @class_names = @{$self->get_class_names()};
3102 0           my @class_names = $self->{_dt}->get_class_names();
3103 0           my $class_probabilities = $self->{_root_dtnode}->get_class_probabilities();
3104 0           my ($feature,$op,$value) = $self->extract_feature_op_val( $branch_features_to_node[-1] );
3105 0 0         my $msg = @samples_at_node == 0
3106             ? "\n\n There are NO training data samples directly in the region of the feature space assigned to node $node_id: @samples_at_node\n\n"
3107             : "\n Samples in the portion of the feature space assigned to Node $node_id: @samples_at_node\n";
3108 0           $msg .= "\n Feature tests on the branch to node $node_id: [@branch_features_to_node]\n\n";
3109 0           $msg .= "\n Would you like to see the probability associated with the last feature test on the branch leading to Node $node_id?\n";
3110 0           $msg .= "\n Enter 'y' if yes and any other character for 'no': ";
3111 0           print $msg;
3112 0           my $ans = <STDIN>;
3113 0           $ans =~ s/^\s*|\s*$//g;
3114 0 0         if ($ans =~ /^ye?s?$/) {
3115 0           my $sequence = [$branch_features_to_node[-1]];
3116 0           my $prob = $self->{_dt}->probability_of_a_sequence_of_features_and_values_or_thresholds($sequence);
3117 0           print "\n probability of @{$sequence} is: $prob\n";
  0            
3118             }
3119 0           $msg = "\n Using Bayes rule, would you like to see the class probabilities predicated on just the last feature test on the branch leading to Node $node_id?\n";
3120 0           $msg .= "\n Enter 'y' for yes and any other character for no: ";
3121 0           print $msg;
3122 0           $ans = <STDIN>;
3123 0           $ans =~ s/^\s*|\s*$//g;
3124 0 0         if ($ans =~ /^ye?s?$/) {
3125 0           my $sequence = [$branch_features_to_node[-1]];
3126 0           foreach my $cls (@class_names) {
3127 0           my $prob = $self->{_dt}->probability_of_a_class_given_sequence_of_features_and_values_or_thresholds($cls, $sequence);
3128 0           print "\n probability of class $cls given just the feature test @{$sequence} is: $prob\n";
  0            
3129             }
3130             } else {
3131 0           print "goodbye\n";
3132             }
3133 0           print "\n Finished supplying information on Node $node_id\n\n";
3134             }
3135              
3136             1;
3137              
3138             =pod
3139              
3140             =head1 NAME
3141              
3142             Algorithm::DecisionTree - A Perl module for decision-tree based classification of
3143             multidimensional data.
3144              
3145              
3146             =head1 SYNOPSIS
3147              
3148             # FOR CONSTRUCTING A DECISION TREE AND FOR CLASSIFYING A SAMPLE:
3149              
3150             # In general, your call for constructing an instance of the DecisionTree class
3151             # will look like:
3152              
3153             my $training_datafile = "stage3cancer.csv";
3154             my $dt = Algorithm::DecisionTree->new(
3155             training_datafile => $training_datafile,
3156             csv_class_column_index => 2,
3157             csv_columns_for_features => [3,4,5,6,7,8],
3158             entropy_threshold => 0.01,
3159             max_depth_desired => 8,
3160             symbolic_to_numeric_cardinality_threshold => 10,
3161             csv_cleanup_needed => 1,
3162             );
3163              
3164             # The constructor option `csv_class_column_index' informs the module as to which
3165             # column of your CSV file contains the class label. THE COLUMN INDEXING IS ZERO
3166             # BASED. The constructor option `csv_columns_for_features' specifies which columns
3167             # are to be used for feature values. The first row of the CSV file must specify
3168             # the names of the features. See examples of CSV files in the `Examples'
3169             # subdirectory.
3170              
3171             # The option `symbolic_to_numeric_cardinality_threshold' is also important. For
3172             # the example shown above, if an ostensibly numeric feature takes on only 10 or
3173             # fewer different values in your training datafile, it will be treated like a
3174             # symbolic feature. The option `entropy_threshold' determines the granularity
3175             # with which the entropies are sampled for the purpose of calculating entropy gain
3176             # with a particular choice of decision threshold for a numeric feature or a feature
3177             # value for a symbolic feature.
3178              
3179             # The option 'csv_cleanup_needed' is by default set to 0. If you set it
3180             # to 1, that would cause all line records in your CSV file to be "sanitized" before
3181             # they are used for constructing a decision tree. You need this option if your CSV
3182             # file uses double-quoted field names and field values in the line records and if
3183             # such double-quoted strings are allowed to include commas for, presumably, better
3184             # readability.
3185              
3186             # After you have constructed an instance of the DecisionTree class as shown above,
3187             # you read in the training data file and initialize the probability cache by
3188             # calling:
3189              
3190             $dt->get_training_data();
3191             $dt->calculate_first_order_probabilities();
3192             $dt->calculate_class_priors();
3193              
3194             # Next you construct a decision tree for your training data by calling:
3195              
3196             $root_node = $dt->construct_decision_tree_classifier();
3197              
3198             # where $root_node is an instance of the DTNode class that is also defined in the
3199             # module file. Now you are ready to classify a new data record. Let's say that
3200             # your data record looks like:
3201              
3202             my @test_sample = qw / g2=4.2
3203             grade=2.3
3204             gleason=4
3205             eet=1.7
3206             age=55.0
3207             ploidy=diploid /;
3208              
3209             # You can classify it by calling:
3210              
3211             my $classification = $dt->classify($root_node, \@test_sample);
3212              
3213             # The call to `classify()' returns a reference to a hash whose keys are the class
3214             # names and the values the associated classification probabilities. This hash also
3215             # includes another key-value pair for the solution path from the root node to the
3216             # leaf node at which the final classification was carried out.
3217              
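             # Given the hash reference returned by `classify()', you might list the class
             # probabilities in decreasing order as in the sketch below. The sketch assumes
             # the solution path is stored under the key 'solution_path'; treat that key
             # name, like the variable names, as illustrative rather than definitive:

                 my %results = %$classification;
                 my $solution_path = delete $results{'solution_path'};   # assumed key name
                 foreach my $class_name (sort {$results{$b} <=> $results{$a}} keys %results) {
                     print "$class_name => $results{$class_name}\n";
                 }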
3218              
3219             =head1 CHANGES
3220              
3221             This version reintroduces C<csv_cleanup_needed> as an optional
3222             parameter in the module constructor. This was done in response to several requests
3223             received from the user community. (Previously, all line records from a CSV file were
3224             run through the CSV cleanup code no matter what.) The main point made by
3225             the users was that invoking the cleanup when there was no need for it
3226             exacted a performance penalty when ingesting large database files with tens of
3227             thousands of line records. In addition to making the cleanup optional, I
3228             have also tweaked the cleanup code so that it can extract
3229             data from a larger range of messy CSV files.
3230              
3231             B All the changes made in this version relate to the construction of
3232             regression trees. I have fixed a couple of bugs in the calculation of the regression
3233             coefficients. Additionally, the C class now comes with a new
3234             constructor parameter named C. For most cases, you'd set this
3235             parameter to 0, which causes the regression coefficients to be estimated through
3236             linear least-squares minimization.
3237              
3238             B In addition to constructing decision trees, this version of the
3239             module also allows you to construct regression trees. The regression tree capability
3240             has been packed into a separate subclass, named C, of the main
3241             C class. The subdirectory C in the main
3242             installation directory illustrates how you can use this new functionality of the
3243             module.
3244              
3245             B This version incorporates four very significant upgrades/changes to
3246             the C module: B<(1)> The CSV cleanup is now the default. So you do not
3247             have to set any special parameters in the constructor calls to initiate CSV
3248             cleanup. B<(2)> In the form of a new Perl class named C,
3249             this module provides you with an easy-to-use programming interface for attempting
3250             needle-in-a-haystack solutions for the case when your training data is overwhelmingly
3251             dominated by a single class. You need to set the constructor parameter
3252             C to invoke the logic that constructs multiple
3253             decision trees, each using the minority class samples along with samples drawn
3254             randomly from the majority class. The final classification is made through a
3255             majority vote from all the decision trees. B<(3)> Assuming you are faced with a
3256             big-data problem --- in the sense that you have been given a training database with a
3257             very large number of training records --- the class C will
3258             also let you construct multiple decision trees by pulling training data randomly from
3259             your training database (without paying attention to the relative populations of the
3260             classes). The final classification decision for a test sample is based on a majority
3261             vote from all the decision trees thus constructed. See the C
3262             directory for how to use these new features of the module. And, finally, B<(4)>
3263             Support for the old-style '.dat' training files has been dropped in this version.
3264              
3265             B This version makes it easier to use a CSV training file that
3266             violates the assumption that a comma be used only to separate the different field
3267             values in a line record. Some large econometrics databases use double-quoted values
3268             for fields, and these values may contain commas (presumably for better readability).
3269             This version also allows you to specify the leftmost entry in the first CSV record
3270             that names all the fields. Previously, this entry was required to be an empty
3271             double-quoted string. I have also made some minor changes to the
3272             'C' method to make it more user friendly for large
3273             training files that may contain tens of thousands of records. When pulling training
3274             data from such files, this method prints out a dot on the terminal screen for every
3275             10000 records it has processed.
3276              
3277             This version brings the boosting capability to the C<Algorithm::DecisionTree>
3278             module.
3279              
3280             This version adds bagging to the C<Algorithm::DecisionTree> module. If your
3281             training dataset is large enough, you can ask the module to construct multiple
3282             decision trees using data bags extracted from your dataset. The module can show you
3283             the results returned by the individual decision trees and also the results obtained
3284             by taking a majority vote of the classification decisions made by the individual
3285             trees. You can specify any arbitrary extent of overlap between the data bags.
3286              
3287             B The introspection capability in this version packs more of a punch.
3288             For each training data sample, you can now figure out not only the decision-tree
3289             nodes that are affected directly by that sample, but also those nodes that are
3290             affected indirectly through the generalization achieved by the probabilistic modeling
3291             of the data. The 'examples' directory of this version includes additional scripts
3292             that illustrate these enhancements to the introspection capability. See the section
3293             "The Introspection API" for a declaration of the introspection related methods, old
3294             and new.
3295              
3296             In response to requests from several users, this version includes a new
3297             capability: You can now ask the module to introspect about the classification
3298             decisions returned by the decision tree. Toward that end, the module includes a new
3299             class named C<DTIntrospection>. Perhaps the most important bit of information you
3300             are likely to seek through DT introspection is the list of the training samples that
3301             fall directly in the portion of the feature space that is assigned to a node.
3302             When training samples are non-uniformly distributed in the underlying
3303             feature space, IT IS POSSIBLE FOR A NODE TO EXIST EVEN WHEN NO TRAINING SAMPLES FALL
3304             IN THE PORTION OF THE FEATURE SPACE ASSIGNED TO THE NODE. B<(This is an important
3305             part of the generalization achieved by probabilistic modeling of the training data.)>
3306             For additional information related to DT introspection, see the section titled
3307             "DECISION TREE INTROSPECTION" in this documentation page.
3308              
3309             B makes the logic of tree construction from the old-style '.dat' training
3310             files more consistent with how trees are constructed from the data in `.csv' files.
3311             The inconsistency in the past was with respect to the naming convention for the class
3312             labels associated with the different data records.
3313              
3314             B fixes a bug in the part of the module that some folks use for generating
3315             synthetic data for experimenting with decision tree construction and classification.
3316             In the class C that is a part of the module, there
3317             was a problem with the order in which the features were recorded from the
3318             user-supplied parameter file. The basic code for decision tree construction and
3319             classification remains unchanged.
3320              
3321             B further downshifts the required version of Perl for this module. This
3322             was a result of testing the module with Version 5.10.1 of Perl. Only one statement
3323             in the module code needed to be changed for the module to work with the older version
3324             of Perl.
3325              
3326             B fixes the C restriction on the required Perl version. This
3327             version should work with Perl versions 5.14.0 and higher.
3328              
3329             B changes the required version of Perl from 5.18.0 to 5.14.0. Everything
3330             else remains the same.
3331              
3332             B should prove more robust when the probability distribution for the
3333             values of a feature is expected to be heavy-tailed; that is, when the supposedly rare
3334             observations can occur with significant probabilities. A new option in the
3335             DecisionTree constructor lets the user specify the precision with which the
3336             probability distributions are estimated for such features.
3337              
3338             B fixes a bug that was caused by the explicitly set zero values for
3339             numerical features being misconstrued as "false" in the conditional statements in
3340             some of the method definitions.
3341              
3342             B makes it easier to write code for classifying in one go all of your test
3343             data samples in a CSV file. The bulk classifications obtained can be written out to
3344             either a CSV file or to a regular text file. See the script
3345             C in the C directory for how to
3346             classify all of your test data records in a CSV file. This version also includes
3347             improved code for generating synthetic numeric/symbolic training and test data
3348             records for experimenting with the decision tree classifier.
3349              
3350             B<Version 2.1> allows you to test the quality of your training data by running a 10-fold
3351             cross-validation test on the data. This test divides all of the training data into
3352             ten parts, with nine parts used for training a decision tree and one part used for
3353             testing its ability to classify correctly. This selection of nine parts for training
3354             and one part for testing is carried out in all of the ten different ways that are
3355             possible. This testing functionality in Version 2.1 can also be used to find the
3356             best values to use for the constructor parameters C<entropy_threshold>,
3357             C<max_depth_desired>, and C<symbolic_to_numeric_cardinality_threshold>.
3358              
3359             B Now you can use both numeric and
3360             symbolic features for constructing a decision tree. A feature is numeric if it can
3361             take any floating-point value over an interval.
3362              
3363             B fixes a bug in the code that was triggered by 0 being declared as one of
3364             the features values in the training datafile. Version 1.71 also include an additional
3365             safety feature that is useful for training datafiles that contain a very large number
3366             of features. The new version makes sure that the number of values you declare for
3367             each sample record matches the number of features declared at the beginning of the
3368             training datafile.
3369              
3370             B includes safety checks on the consistency of the data you place in your
3371             training datafile. When a training file contains thousands of samples, it is
3372             difficult to manually check that you used the same class names in your sample records
3373             that you declared at top of your training file or that the values you have for your
3374             features are legal vis-a-vis the earlier declarations of the values in the training
3375             file. Another safety feature incorporated in this version is the non-consideration
3376             of classes that are declared at the top of the training file but that have no sample
3377             records in the file.
3378              
3379             B uses probability caching much more extensively compared to the previous
3380             versions. This should result in faster construction of large decision trees.
3381             Another new feature in Version 1.6 is the use of a decision tree for interactive
3382             classification. In this mode, after you have constructed a decision tree from the
3383             training data, the user is prompted for answers to the questions pertaining to the
3384             feature tests at the nodes of the tree.
3385              
3386             Some key elements of the documentation were cleaned up and made more readable in
3387             B. The implementation code remains unchanged from Version 1.4.
3388              
3389             B should make things faster (and easier) for folks who want to use this
3390             module with training data that creates very large decision trees (that is, trees with
3391             tens of thousands or more decision nodes). The speedup in Version 1.4 has been
3392             achieved by eliminating duplicate calculation of probabilities as the tree grows. In
3393             addition, this version provides an additional constructor parameter,
3394             C for controlling the size of the decision tree. This is in
3395             addition to the tree size control achieved by the parameter C that
3396             was introduced in Version 1.3. Since large decision trees can take a long time to
3397             create, you may find yourself wishing you could store the tree you just created in a
3398             disk file and that, subsequently, you could use the stored tree for classification
3399             work. The C directory contains two scripts, C and
3400             C, that show how you can do exactly that with the
3401             help of Perl's C module.
3402              
3403             B addresses the issue that arises when the header of a training datafile
3404             declares a certain possible value for a feature but that (feature,value) pair does
3405             NOT show up anywhere in the training data. Version 1.3 also makes it possible for a
3406             user to control the size of the decision tree by changing the value of the parameter
3407             C Additionally, Version 1.3 includes a method called
3408             C that displays useful information regarding the size and
3409             some other attributes of the training data. It also warns the user if the training
3410             data might result in a decision tree that would simply be much too large --- unless
3411             the user controls the size with the entropy_threshold parameter.
3412              
3413             In addition to the removal of a couple of serious bugs, B incorporates a
3414             number of enhancements: (1) Version 1.2 includes checks on the names of the features
3415             and values used in test data --- this is the data you want to classify with the
3416             decision tree classifier constructed by this module. (2) Version 1.2 includes a
3417             separate constructor for generating test data. To make it easier to generate test
3418             data whose probabilistic parameters may not be identical to that used for the
3419             training data, I have used separate routines for generating the test data. (3)
3420             Version 1.2 also includes in its examples directory a script that classifies the test
3421             data in a file and outputs the class labels into another file. This is for folks who
3422             do not wish to write their own scripts using this module. (4) Version 1.2 also
3423             includes addition to the documentation regarding the issue of numeric values for
3424             features.
3425              
3426             =head1 DESCRIPTION
3427              
3428             B is a I module for constructing a decision tree from
3429             a training datafile containing multidimensional data. In one form or another,
3430             decision trees have been around for about fifty years. From a statistical
3431             perspective, they are closely related to classification and regression by recursive
3432             partitioning of multidimensional data. Early work that demonstrated the usefulness
3433             of such partitioning of data for classification and regression can be traced to the
3434             work of Terry Therneau in the early 1980's in the statistics community, and to the
3435             work of Ross Quinlan in the mid 1990's in the machine learning community.
3436              
3437             For those not familiar with decision tree ideas, the traditional way to classify
3438             multidimensional data is to start with a feature space whose dimensionality is the
3439             same as that of the data. Each feature in this space corresponds to the attribute
3440             that each dimension of the data measures. You then use the training data to carve up
3441             the feature space into different regions, each corresponding to a different class.
3442             Subsequently, when you try to classify a new data sample, you locate it in the
3443             feature space and find the class label of the region to which it belongs. One can
3444             also give the new data point the same class label as that of the nearest training
3445             sample. This is referred to as the nearest neighbor classification. There exist
3446             hundreds of variations of varying power on these two basic approaches to the
3447             classification of multidimensional data.
3448              
3449             A decision tree classifier works differently. When you construct a decision tree,
3450             you select for the root node a feature test that partitions the training data in a
3451             way that causes maximal disambiguation of the class labels associated with the data.
3452             In terms of information content as measured by entropy, such a feature test would
3453             cause maximum reduction in class entropy in going from all of the training data taken
3454             together to the data as partitioned by the feature test. You then drop from the root
3455             node a set of child nodes, one for each partition of the training data created by the
3456             feature test at the root node. When your features are purely symbolic, you'll have
3457             one child node for each value of the feature chosen for the feature test at the root.
3458             When the test at the root involves a numeric feature, you find the decision threshold
3459             for the feature that best bipartitions the data and you drop from the root node two
3460             child nodes, one for each partition. Now at each child node you pose the same
3461             question that you posed when you found the best feature to use at the root: Which
3462             feature at the child node in question would maximally disambiguate the class labels
3463             associated with the training data corresponding to that child node?
3464              
3465             As the reader would expect, the two key steps in any approach to decision-tree based
3466             classification are the construction of the decision tree itself from a file
3467             containing the training data, and then using the decision tree thus obtained for
3468             classifying new data.
3469              
3470             What is cool about decision tree classification is that it gives you soft
3471             classification, meaning it may associate more than one class label with a given data
3472             vector. When this happens, it may mean that your classes are indeed overlapping in
3473             the underlying feature space. It could also mean that you simply have not supplied
3474             sufficient training data to the decision tree classifier. For a tutorial
3475             introduction to how a decision tree is constructed and used, visit
3476             L
3477              
3478             This module also allows you to generate your own synthetic training and test
3479             data. Generating your own training data, using it for constructing a decision-tree
3480             classifier, and subsequently testing the classifier on a synthetically generated
3481             test set of data is a good way to develop greater proficiency with decision trees.
3482              
3483              
3484             =head1 WHAT PRACTICAL PROBLEM IS SOLVED BY THIS MODULE
3485              
3486             If you are new to the concept of a decision tree, their practical utility is best
3487             understood with an example that only involves symbolic features. However, as
3488             mentioned earlier, versions of the module higher than 2.0 allow you to use both
3489             symbolic and numeric features.
3490              
3491             Consider the following scenario: Let's say you are running a small investment company
3492             that employs a team of stockbrokers who make buy/sell decisions for the customers of
3493             your company. Assume that your company has asked the traders to make each investment
3494             decision on the basis of the following four criteria:
3495              
3496             price_to_earnings_ratio (P_to_E)
3497              
3498             price_to_sales_ratio (P_to_S)
3499              
3500             return_on_equity (R_on_E)
3501              
3502             market_share (MS)
3503              
3504             Since you are the boss, you keep track of the buy/sell decisions made by the
3505             individual traders. But one unfortunate day, all of your traders decide to quit
3506             because you did not pay them enough. So what do you do? If you had a module like
3507             the one here, you could still run your company and do so in such a way that, on the
3508             average, would do better than any of the individual traders who worked for your
3509             company. This is what you do: You pool together the individual trader buy/sell
3510             decisions you have accumulated during the last one year. This pooled information is
3511             likely to look like:
3512              
3513              
3514             example buy/sell P_to_E P_to_S R_on_E MS
3515             ==============================================================
3516              
3517             example_1 buy high low medium low
3518             example_2 buy medium medium low low
3519             example_3 sell low medium low high
3520             ....
3521             ....
3522              
3523             This data, when formatted according to CSV, would constitute your training file. You
3524             could feed this file into the module by calling:
3525              
3526             my $dt = Algorithm::DecisionTree->new(
3527             training_datafile => $training_datafile,
3528             csv_class_column_index => 1,
3529             csv_columns_for_features => [2,3,4,5],
3530             );
3531             $dt->get_training_data();
3532             $dt->calculate_first_order_probabilities();
3533             $dt->calculate_class_priors();
3534              
3535             Subsequently, you would construct a decision tree by calling:
3536              
3537             my $root_node = $dt->construct_decision_tree_classifier();
3538              
3539             Now you and your company (with practically no employees) are ready to service the
3540             customers again. Suppose your computer needs to make a buy/sell decision about an
3541             investment prospect that is best described by:
3542              
3543             price_to_earnings_ratio = low
3544             price_to_sales_ratio = very_low
3545             return_on_equity = none
3546             market_share = medium
3547              
3548             All that your computer would need to do would be to construct a data vector like
3549              
3550             my @data = qw / P_to_E=low
3551             P_to_S=very_low
3552             R_on_E=none
3553             MS=medium /;
3554              
3555             and call the decision tree classifier you just constructed by
3556              
3557             $dt->classify($root_node, \@data);
3558              
3559             The answer returned will be 'buy' and 'sell', along with the associated
3560             probabilities. So if the probability of 'buy' is considerably greater than the
3561             probability of 'sell', that's what you should instruct your computer to do.
3562              
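             Purely as an illustration of how the returned hash might be used (and assuming,
             as in the SYNOPSIS, that the classification probabilities are keyed by class
             name, with the solution path stored under an additional key such as
             'solution_path'), the buy/sell decision could be automated along these lines:

                 my $answer = $dt->classify($root_node, \@data);
                 my %probs  = %$answer;
                 delete $probs{'solution_path'};                      # assumed key name
                 my ($decision) = sort {$probs{$b} <=> $probs{$a}} keys %probs;
                 print "Recommended action: $decision\n";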
3563             The chances are that, on the average, this approach would beat the performance of any
3564             of your individual traders who worked for you previously since the buy/sell decisions
3565             made by the computer would be based on the collective wisdom of all your previous
3566             traders. The real power of decision trees is, of course, not fully
3567             captured by the silly little example here. However, it does nicely convey the
3568             sense in which the current module could be used.
3569              
3570             =head1 SYMBOLIC FEATURES VERSUS NUMERIC FEATURES
3571              
3572             A feature is symbolic when its values are compared using string comparison operators.
3573             By the same token, a feature is numeric when its values are compared using numeric
3574             comparison operators. Having said that, features that take only a small number of
3575             numeric values in the training data can be treated symbolically provided you are
3576             careful about handling their values in the test data. At the least, you have to set
3577             the test data value for such a feature to its closest value in the training data.
3578             The module does that automatically for you for those numeric features for which the
3579             number of different numeric values is less than a user-specified threshold. For those
3580             numeric features that the module is allowed to treat symbolically, this snapping of
3581             the values of the features in the test data to the small set of values in the training
3582             data is carried out automatically by the module. That is, after a user has told the
3583             module which numeric features to treat symbolically, the user need not worry about
3584             how the feature values appear in the test data.
3585              
3586             The constructor parameter C<symbolic_to_numeric_cardinality_threshold> lets you tell
3587             the module when to consider an otherwise numeric feature symbolically. Suppose you
3588             set this parameter to 10; that means that all numeric-looking features that take 10
3589             or fewer different values in the training datafile will be considered to be symbolic
3590             features by the module. See the tutorial at
3591             L for
3592             further information on the implementation issues related to the symbolic and numeric
3593             features.
3594              
3595             =head1 FEATURES WITH NOT SO "NICE" STATISTICAL PROPERTIES
3596              
3597             For the purpose of estimating the probabilities, it is necessary to sample the range
3598             of values taken on by a numerical feature. For features with "nice" statistical
3599             properties, this sampling interval is set to the median of the differences between
3600             the successive feature values in the training data. (Obviously, as you would expect,
3601             you first sort all the values for a feature before computing the successive
3602             differences.) This logic will not work for the sort of a feature described below.
3603              
3604             Consider a feature whose values are heavy-tailed, and, at the same time, the values
3605             span a million to one range. What I mean by heavy-tailed is that rare values can
3606             occur with significant probabilities. It could happen that most of the values for
3607             such a feature are clustered at one of the two ends of the range. At the same time,
3608             there may exist a significant number of values near the end of the range that is less
3609             populated. (Typically, features related to human economic activities --- such as
3610             wealth, incomes, etc. --- are of this type.) With the logic described in the
3611             previous paragraph, you could end up with a sampling interval that is much too small,
3612             which could result in millions of sampling points for the feature if you are not
3613             careful.
3614              
3615             Beginning with Version 2.22, you have two options in dealing with such features. You
3616             can choose to go with the default behavior of the module, which is to sample the
3617             value range for such a feature over a maximum of 500 points. Or, you can supply an
3618             additional option to the constructor that sets a user-defined value for the number of
3619             points to use. The name of the option is C<number_of_histogram_bins>. The following
3620             script
3621              
3622             construct_dt_for_heavytailed.pl
3623              
3624             in the C<Examples> directory shows an example of how to call the constructor of the
3625             module with the C<number_of_histogram_bins> option.
3626              
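             In case you would rather see the essential call right here, a minimal sketch
             follows; the training file name, the column indices, and the value of 100 chosen
             for C<number_of_histogram_bins> are all purely illustrative:

                 my $dt = Algorithm::DecisionTree->new(
                              training_datafile         => "my_heavytailed_data.csv",  # hypothetical file
                              csv_class_column_index    => 1,
                              csv_columns_for_features  => [2,3,4],
                              number_of_histogram_bins  => 100,     # illustrative value
                          );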
3627              
3628             =head1 TESTING THE QUALITY OF YOUR TRAINING DATA
3629              
3630             Versions 2.1 and higher include a new class named C<EvalTrainingData>, derived from
3631             the main class C<DecisionTree>, that runs a 10-fold cross-validation test on your
3632             training data to test its ability to discriminate between the classes mentioned in
3633             the training file.
3634              
3635             The 10-fold cross-validation test divides all of the training data into ten parts,
3636             with nine parts used for training a decision tree and one part used for testing its
3637             ability to classify correctly. This selection of nine parts for training and one part
3638             for testing is carried out in all of the ten different possible ways.
3639              
3640             The following code fragment illustrates how you invoke the testing function of the
3641             EvalTrainingData class:
3642              
3643             my $training_datafile = "training.csv";
3644             my $eval_data = EvalTrainingData->new(
3645             training_datafile => $training_datafile,
3646             csv_class_column_index => 1,
3647             csv_columns_for_features => [2,3],
3648             entropy_threshold => 0.01,
3649             max_depth_desired => 3,
3650             symbolic_to_numeric_cardinality_threshold => 10,
3651             csv_cleanup_needed => 1,
3652             );
3653             $eval_data->get_training_data();
3654             $eval_data->evaluate_training_data()
3655              
3656             The last statement above prints out a Confusion Matrix and the value of Training Data
3657             Quality Index on a scale of 0 to 100, with 100 designating perfect training data.
3658             The Confusion Matrix shows how the different classes were mislabeled in the 10-fold
3659             cross-validation test.
3660              
3661             This testing functionality can also be used to find the best values to use for the
3662             constructor parameters C<entropy_threshold>, C<max_depth_desired>, and
3663             C<symbolic_to_numeric_cardinality_threshold>.
3664              
3665             The following two scripts in the C<Examples> directory illustrate the use of the
3666             C<EvalTrainingData> class for testing the quality of your data:
3667              
3668             evaluate_training_data1.pl
3669             evaluate_training_data2.pl
3670              
3671              
3672             =head1 HOW TO MAKE THE BEST CHOICES FOR THE CONSTRUCTOR PARAMETERS
3673              
3674             Assuming your training data is good, the quality of the results you get from a
3675             decision tree would depend on the choices you make for the constructor parameters
3676             C<entropy_threshold>, C<max_depth_desired>, and
3677             C<symbolic_to_numeric_cardinality_threshold>. You can optimize your choices for
3678             these parameters by running the 10-fold cross-validation test that is made available
3679             in Versions 2.2 and higher through the new class C<EvalTrainingData> that is included
3680             in the module file. A description of how to run this test is in the previous section
3681             of this document.
3682              
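             If you want to automate that search, a rough sketch of a parameter sweep is shown
             below. The grid values are purely illustrative, and the Training Data Quality
             Index for each combination is printed by C<evaluate_training_data()> rather than
             captured programmatically here:

                 foreach my $entropy_threshold (0.01, 0.1) {
                     foreach my $max_depth_desired (3, 5, 8) {
                         print "\n=== entropy_threshold=$entropy_threshold  " .
                               "max_depth_desired=$max_depth_desired ===\n";
                         my $eval_data = EvalTrainingData->new(
                                             training_datafile        => "training.csv",
                                             csv_class_column_index   => 1,
                                             csv_columns_for_features => [2,3],
                                             entropy_threshold        => $entropy_threshold,
                                             max_depth_desired        => $max_depth_desired,
                                             symbolic_to_numeric_cardinality_threshold => 10,
                                             csv_cleanup_needed       => 1,
                                         );
                         $eval_data->get_training_data();
                         $eval_data->evaluate_training_data();
                     }
                 }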
3683              
3684             =head1 DECISION TREE INTROSPECTION
3685              
3686             Starting with Version 2.30, you can ask the C<DTIntrospection> class of the module to
3687             explain the classification decisions made at the different nodes of the decision
3688             tree.
3689              
3690             Perhaps the most important bit of information you are likely to seek through DT
3691             introspection is the list of the training samples that fall directly in the portion
3692             of the feature space that is assigned to a node.
3693              
3694             However, note that, when training samples are non-uniformly distributed in the
3695             underlying feature space, it is possible for a node to exist even when there are no
3696             training samples in the portion of the feature space assigned to the node. That is
3697             because the decision tree is constructed from the probability densities estimated
3698             from the training data. When the training samples are non-uniformly distributed, it
3699             is entirely possible for the estimated probability densities to be non-zero in a
3700             small region around a point even when there are no training samples specifically in
3701             that region. (After you have created a statistical model for, say, the height
3702             distribution of people in a community, the model may return a non-zero probability
3703             for the height values in a small interval even if the community does not include a
3704             single individual whose height falls in that interval.)
3705              
3706             That a decision-tree node can exist even when there are no training samples in that
3707             portion of the feature space that belongs to the node is an important indication of
3708             the generalization ability of a decision-tree-based classifier.
3709              
3710             In light of the explanation provided above, before the DTIntrospection class supplies
3711             any answers at all, it asks you to accept the fact that features can take on non-zero
3712             probabilities at a point in the feature space even though there are zero training
3713             samples at that point (or in a small region around that point). If you do not accept
3714             this rudimentary fact, the introspection class will not yield any answers (since you
3715             are not going to believe the answers anyway).
3716              
3717             The point made above implies that the path leading to a node in the decision tree may
3718             test a feature for a certain value or threshold despite the fact that the portion of
3719             the feature space assigned to that node is devoid of any training data.
3720              
3721             See the following three scripts in the Examples directory for how to carry out DT
3722             introspection:
3723              
3724             introspection_in_a_loop_interactive.pl
3725              
3726             introspection_show_training_samples_at_all_nodes_direct_influence.pl
3727              
3728             introspection_show_training_samples_to_nodes_influence_propagation.pl
3729              
3730             The first script places you in an interactive session in which you will first be
3731             asked for the node number you are interested in. Subsequently, you will be asked for
3732             whether or not you are interested in specific questions that the introspection can
3733             provide answers for. The second script descends down the decision tree and shows for
3734             each node the training samples that fall directly in the portion of the feature space
3735             assigned to that node. The third script shows for each training sample how it
3736             affects the decision-tree nodes either directly or indirectly through the
3737             generalization achieved by the probabilistic modeling of the data.
3738              
3739             The output of the script
3740             C<introspection_show_training_samples_at_all_nodes_direct_influence.pl> looks like:
3741              
3742             Node 0: the samples are: None
3743             Node 1: the samples are: [sample_46 sample_58]
3744             Node 2: the samples are: [sample_1 sample_4 sample_7 .....]
3745             Node 3: the samples are: []
3746             Node 4: the samples are: []
3747             ...
3748             ...
3749              
3750             The nodes for which no samples are listed come into existence through
3751             the generalization achieved by the probabilistic modeling of the data.
3752              
3753             The output produced by the script
3754             C<introspection_show_training_samples_to_nodes_influence_propagation.pl> looks like
3755              
3756             sample_1:
3757             nodes affected directly: [2 5 19 23]
3758             nodes affected through probabilistic generalization:
3759             2=> [3 4 25]
3760             25=> [26]
3761             5=> [6]
3762             6=> [7 13]
3763             7=> [8 11]
3764             8=> [9 10]
3765             11=> [12]
3766             13=> [14 18]
3767             14=> [15 16]
3768             16=> [17]
3769             19=> [20]
3770             20=> [21 22]
3771             23=> [24]
3772            
3773             sample_4:
3774             nodes affected directly: [2 5 6 7 11]
3775             nodes affected through probabilistic generalization:
3776             2=> [3 4 25]
3777             25=> [26]
3778             5=> [19]
3779             19=> [20 23]
3780             20=> [21 22]
3781             23=> [24]
3782             6=> [13]
3783             13=> [14 18]
3784             14=> [15 16]
3785             16=> [17]
3786             7=> [8]
3787             8=> [9 10]
3788             11=> [12]
3789            
3790             ...
3791             ...
3792             ...
3793              
3794             For each training sample, the display shown above first presents the list of nodes
3795             that are directly affected by the sample. A node is affected directly by a sample if
3796             the latter falls in the portion of the feature space that belongs to the former.
3797             Subsequently, for each training sample, the display shows a subtree of the nodes that
3798             are affected indirectly by the sample through the generalization achieved by the
3799             probabilistic modeling of the data. In general, a node is affected indirectly by a
3800             sample if it is a descendant of another node that is affected directly.
3801              
3802             Also see the section titled B<THE INTROSPECTION API> regarding how to invoke the
3803             introspection capabilities of the module in your own code.
3804              
3805             =head1 METHODS
3806              
3807             The module provides the following methods for constructing a decision tree from
3808             training data in a disk file and for classifying new data records with the decision
3809             tree thus constructed:
3810              
3811             =over 4
3812              
3813             =item B<new():>
3814              
3815             my $dt = Algorithm::DecisionTree->new(
3816             training_datafile => $training_datafile,
3817             csv_class_column_index => 2,
3818             csv_columns_for_features => [3,4,5,6,7,8],
3819             entropy_threshold => 0.01,
3820             max_depth_desired => 8,
3821             symbolic_to_numeric_cardinality_threshold => 10,
3822             csv_cleanup_needed => 1,
3823             );
3824              
3825             A call to C<new()> constructs a new instance of the C<Algorithm::DecisionTree> class.
3826             For this call to make sense, the training data in the training datafile must be
3827             in the CSV format.
3828              
3829             =back
3830              
3831             =head2 The Constructor Parameters
3832              
3833             =over 8
3834              
3835             =item C<training_datafile>:
3836              
3837             This parameter supplies the name of the file that contains the training data.
3838              
3839             =item C<csv_class_column_index>:
3840              
3841             When using a CSV file for your training data, this parameter supplies the zero-based
3842             column index for the column that contains the class label for each data record in the
3843             training file.
3844              
3845              
3846             =item C<csv_cleanup_needed>:
3847              
3848             You need to set this parameter to 1 if your CSV file has double quoted strings (which
3849             may include commas) as values for the fields and if such values are allowed to
3850             include commas for, presumably, better readability.
3851              
3852             =item C<csv_columns_for_features>:
3853              
3854             When using a CSV file for your training data, this parameter supplies a list of
3855             columns corresponding to the features you wish to use for decision tree construction.
3856             Each column is specified by its zero-based index.
3857              
3858             =item C<entropy_threshold>:
3859              
3860             This parameter sets the granularity with which the entropies are sampled by the
3861             module. For example, a feature test at a node in the decision tree is acceptable if
3862             the entropy gain achieved by the test exceeds this threshold. The larger the value
3863             you choose for this parameter, the smaller the tree. Its default value is 0.01.
3864              
3865             =item C<max_depth_desired>:
3866              
3867             This parameter sets the maximum depth of the decision tree. For obvious reasons, the
3868             smaller the value you choose for this parameter, the smaller the tree.
3869              
3870             =item C<symbolic_to_numeric_cardinality_threshold>:
3871              
3872             This parameter allows the module to treat an otherwise numeric feature symbolically
3873             if the number of different values the feature takes in the training data file does
3874             not exceed the value of this parameter.
3875              
3876             =item C<number_of_histogram_bins>:
3877              
3878             This parameter gives the user the option to set the number of points at which the
3879             value range for a feature should be sampled for estimating the probabilities. This
3880             parameter is effective only for those features that occupy a large value range and
3881             whose probability distributions are heavy tailed. B<This parameter is also important
3882             when you have a very large training dataset:> In general, the larger the dataset, the
3883             smaller the smallest difference between any two values for a numeric feature in
3884             relation to the overall range of values for that feature. In such cases, the module
3885             may use too large a number of bins for estimating the probabilities and that may slow
3886             down the calculation of the decision tree. You can get around this difficulty by
3887             explicitly giving a value to the C<number_of_histogram_bins> parameter.
3888              
3889             =back
3890              
3891              
3892             You can choose the best values to use for the last three constructor parameters by
3893             running a 10-fold cross-validation test on your training data through the class
3894             C<EvalTrainingData> that comes with Versions 2.1 and higher of this module. See the
3895             section "TESTING THE QUALITY OF YOUR TRAINING DATA" of this document page.
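
The following is a minimal sketch of how such a cross-validation run can be set up.
The file name and column choices are illustrative, and the C<evaluate_training_data()>
call is the one used by the C<evaluate_training_data1.pl> and C<evaluate_training_data2.pl>
scripts in the C<Examples> directory:

    use Algorithm::DecisionTree;

    # EvalTrainingData is defined inside the Algorithm::DecisionTree module file.
    my $evaluator = EvalTrainingData->new(
                        training_datafile        => 'stage3cancer.csv',
                        csv_class_column_index   => 2,
                        csv_columns_for_features => [3,4,5,6,7,8],
                        entropy_threshold        => 0.01,
                        max_depth_desired        => 8,
                        symbolic_to_numeric_cardinality_threshold => 10,
                        csv_cleanup_needed       => 1,
                    );
    $evaluator->get_training_data();
    $evaluator->evaluate_training_data();   # runs the 10-fold cross-validation test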
3896              
3897             =over
3898              
3899             =item B<get_training_data():>
3900              
3901             After you have constructed a new instance of the C<Algorithm::DecisionTree> class,
3902             you must now read in the training data that is in the file named in the call to the
3903             constructor. This you do by:
3904              
3905             $dt->get_training_data();
3906              
3907              
3908             =item B<show_training_data():>
3909              
3910             If you wish to see the training data that was just digested by the module,
3911             call
3912              
3913             $dt->show_training_data();
3914              
3915             =item B<calculate_first_order_probabilities():>
3916              
3917             =item B<calculate_class_priors():>
3918              
3919             After the module has read the training data file, it needs to initialize the
3920             probability cache. This you do by invoking:
3921              
3922             $dt->calculate_first_order_probabilities()
3923             $dt->calculate_class_priors()
3924              
3925             =item B<construct_decision_tree_classifier():>
3926              
3927             With the probability cache initialized, it is time to construct a decision tree
3928             classifier. This you do by
3929              
3930             my $root_node = $dt->construct_decision_tree_classifier();
3931              
3932             This call returns an instance of type C<DTNode>. The C<DTNode> class is defined
3933             within the main package file. So, don't forget that C<$root_node> in the above
3934             example call will be instantiated to an object of type C<DTNode>.
3935              
3936             =item B<$root_nodeC<< -> >>display_decision_tree(" "):>
3937              
3938             $root_node->display_decision_tree(" ");
3939              
3940             This will display the decision tree in your terminal window by using a recursively
3941             determined offset for each node as the display routine descends down the tree.
3942              
3943             I have intentionally left the syntax fragment C<$root_node> in the above call to
3944             remind the reader that C<display_decision_tree()> is NOT called on the instance of
3945             the C<DecisionTree> we constructed earlier, but on the C<DTNode> instance returned by
3946             the call to C<construct_decision_tree_classifier()>.
3947              
3948             =item B<classify($root_node, \@test_sample):>
3949              
3950             Let's say you want to classify the following data record:
3951              
3952             my @test_sample = qw / g2=4.2
3953             grade=2.3
3954             gleason=4
3955             eet=1.7
3956             age=55.0
3957             ploidy=diploid /;
3958              
3959             you'd make the following call:
3960              
3961             my $classification = $dt->classify($root_node, \@test_sample);
3962              
3963             where, again, C<$root_node> is an instance of type C<DTNode> returned by the call to
3964             C<construct_decision_tree_classifier()>. The variable C<$classification> holds a
3965             reference to a hash whose keys are the class names and whose values are the associated
3966             probabilities. The hash that is returned by the above call also includes a special
3967             key-value pair for a key named C<solution_path>. The value associated with this key
3968             is an anonymous array that holds the path, in the form of a list of nodes, from the
3969             root node to the leaf node in the decision tree where the final classification was
3970             made.
3971              
3972              
3973             =item B<classify_by_asking_questions($root_node):>
3974              
3975             This method allows you to use a decision-tree based classifier in an interactive
3976             mode. In this mode, a user is prompted for answers to the questions pertaining to
3977             the feature tests at the nodes of the tree. The syntax for invoking this method is:
3978              
3979             my $classification = $dt->classify_by_asking_questions($root_node);
3980              
3981             where C<$dt> is an instance of the C<Algorithm::DecisionTree> class returned by a
3982             call to C<new()> and C<$root_node> is the root node of the decision tree returned by a
3983             call to C<construct_decision_tree_classifier()>.
3984              
3985             =back
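
Tying the above methods together, here is a minimal end-to-end sketch that constructs
a decision tree from a CSV training file and classifies one test sample. The file
name and the feature values are illustrative; the sequence of calls is the one
described in the items above:

    use strict;
    use warnings;
    use Algorithm::DecisionTree;

    my $dt = Algorithm::DecisionTree->new(
                 training_datafile        => 'stage3cancer.csv',
                 csv_class_column_index   => 2,
                 csv_columns_for_features => [3,4,5,6,7,8],
                 entropy_threshold        => 0.01,
                 max_depth_desired        => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 csv_cleanup_needed       => 1,
             );
    $dt->get_training_data();
    $dt->calculate_first_order_probabilities();
    $dt->calculate_class_priors();
    my $root_node = $dt->construct_decision_tree_classifier();

    my @test_sample = qw / g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;
    my $classification = $dt->classify($root_node, \@test_sample);

    # Print the class probabilities; the 'solution_path' entry holds the nodes on
    # the path from the root to the leaf where the classification was made.
    foreach my $class_name (sort keys %$classification) {
        next if $class_name eq 'solution_path';
        print "$class_name => $classification->{$class_name}\n";
    }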
3986              
3987              
3988             =head1 THE INTROSPECTION API
3989              
3990             To construct an instance of C<DTIntrospection>, you call
3991              
3992             my $introspector = DTIntrospection->new($dt);
3993              
3994             where you supply the instance of the C<DecisionTree> class you used for constructing
3995             the decision tree through the parameter C<$dt>. After you have constructed an
3996             instance of the introspection class, you must initialize it by
3997              
3998             $introspector->initialize();
3999              
4000             Subsequently, you can invoke either of the following methods:
4001              
4002             $introspector->explain_classification_at_one_node($node);
4003              
4004             $introspector->explain_classifications_at_multiple_nodes_interactively();
4005              
4006             depending on whether you want introspection at a single specified node or inside an
4007             infinite loop for an arbitrary number of nodes.
4008              
4009             If you want to output a tabular display that shows for each node in the decision tree
4010             all the training samples that fall in the portion of the feature space that belongs
4011             to that node, call
4012              
4013             $introspector->display_training_samples_at_all_nodes_direct_influence_only();
4014              
4015             If you want to output a tabular display that shows for each training sample a list of
4016             all the nodes that are affected directly AND indirectly by that sample, call
4017              
4018             $introspector->display_training_training_samples_to_nodes_influence_propagation();
4019              
4020             A training sample affects a node directly if the sample falls in the portion of the
4021             feature space assigned to that node. On the other hand, a training sample is
4022             considered to affect a node indirectly if the node is a descendant of a node that is
4023             affected directly by the sample.
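
As a quick summary, the introspection calls described above can be strung together as
in the following sketch, where C<$dt> is assumed to be the C<Algorithm::DecisionTree>
instance that was used to construct the decision tree:

    my $introspector = DTIntrospection->new($dt);
    $introspector->initialize();

    my $node = 1;   # the node whose classification decision you want explained
    $introspector->explain_classification_at_one_node($node);
    $introspector->explain_classifications_at_multiple_nodes_interactively();

    # Tabular displays of direct and propagated sample-to-node influence:
    $introspector->display_training_samples_at_all_nodes_direct_influence_only();
    $introspector->display_training_training_samples_to_nodes_influence_propagation();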
4024              
4025              
4026             =head1 BULK CLASSIFICATION OF DATA RECORDS
4027              
4028             For large test datasets, you would obviously want to process an entire file of test
4029             data at a time. The following script in the C<Examples> directory illustrates how
4030             you can do that:
4031              
4032             classify_test_data_in_a_file.pl
4033              
4034             This script requires three command-line arguments: the first argument names the
4035             training datafile, the second the test datafile, and the third the file in which the
4036             classification results are to be deposited.
4037              
4038             The other examples directories, C<ExamplesBagging>, C<ExamplesBoosting>, and
4039             C<ExamplesRandomizedTrees>, also contain scripts that illustrate how to carry out
4040             bulk classification of data records when you wish to take advantage of bagging,
4041             boosting, or tree randomization. In their respective directories, these scripts are
4042             named:
4043              
4044             bagging_for_bulk_classification.pl
4045             boosting_for_bulk_classification.pl
4046             classify_database_records.pl
4047              
4048              
4049             =head1 HOW THE CLASSIFICATION RESULTS ARE DISPLAYED
4050              
4051             It depends on whether you apply the classifier at once to all the data samples in a
4052             file, or whether you feed one data sample at a time into the classifier.
4053              
4054             In general, the classifier returns soft classification for a test data vector. What
4055             that means is that, in general, the classifier will list all the classes to which a
4056             given data vector could belong and the probability of each such class label for the
4057             data vector. Run the examples scripts in the Examples directory to see how the output
4058             of classification can be displayed.
4059              
4060             With regard to the soft classifications returned by this classifier, if the
4061             probability distributions for the different classes overlap in the underlying feature
4062             space, you would want the classifier to return all of the applicable class labels for
4063             a data vector along with the corresponding class probabilities. Another reason for
4064             why the decision tree classifier may associate significant probabilities with
4065             multiple class labels is that you used an inadequate number of training samples to
4066             induce the decision tree. The good thing is that the classifier does not lie to you
4067             (unlike, say, a hard classification rule that would return a single class label
4068             corresponding to the partitioning of the underlying feature space). The decision
4069             tree classifier gives you the best classification that can be made given the training
4070             data you fed into it.
4071              
4072              
4073             =head1 USING BAGGING
4074              
4075             Starting with Version 3.0, you can use the class C<DecisionTreeWithBagging> that
4076             comes with the module to incorporate bagging in your decision tree based
4077             classification. Bagging means constructing multiple decision trees for different
4078             (possibly overlapping) segments of the data extracted from your training dataset and
4079             then aggregating the decisions made by the individual decision trees for the final
4080             classification. The aggregation of the classification decisions can average out the
4081             noise and bias that may otherwise affect the classification decision obtained from
4082             just one tree.
4083              
4084             =over 4
4085              
4086             =item B<new():>
4087              
4088             A typical call to the constructor for the C<Algorithm::DecisionTreeWithBagging> class looks
4089             like:
4090              
4091             use Algorithm::DecisionTreeWithBagging;
4092            
4093             my $training_datafile = "stage3cancer.csv";
4094            
4095             my $dtbag = Algorithm::DecisionTreeWithBagging->new(
4096             training_datafile => $training_datafile,
4097             csv_class_column_index => 2,
4098             csv_columns_for_features => [3,4,5,6,7,8],
4099             entropy_threshold => 0.01,
4100             max_depth_desired => 8,
4101             symbolic_to_numeric_cardinality_threshold => 10,
4102             how_many_bags => 4,
4103             bag_overlap_fraction => 0.2,
4104             csv_cleanup_needed => 1,
4105             );
4106            
4107             Note in particular the following two constructor parameters:
4108            
4109             how_many_bags
4110              
4111             bag_overlap_fraction
4112              
4113             where, as the name implies, the parameter C<how_many_bags> controls how many bags
4114             (and, therefore, how many decision trees) will be constructed from your training
4115             dataset; and where the parameter C<bag_overlap_fraction> controls the degree of
4116             overlap between the bags. To understand what exactly is achieved by setting the
4117             parameter C<bag_overlap_fraction> to 0.2 in the above example, let's say that the
4118             non-overlapping partitioning of the training data between the bags results in 100
4119             training samples per bag. With C<bag_overlap_fraction> set to 0.2, an additional 20
4120             samples drawn randomly from the other bags will be added to the data in each bag.
4121              
4122             =back
4123              
4124             =head2 B<Methods defined for C<DecisionTreeWithBagging> class>
4125              
4126             =over 8
4127              
4128             =item B<get_training_data_for_bagging():>
4129              
4130             This method reads your training datafile, randomizes it, and then partitions it into
4131             the specified number of bags. Subsequently, if the constructor parameter
4132             C<bag_overlap_fraction> is non-zero, it adds to each bag additional samples drawn at
4133             random from the other bags. The number of these additional samples added to each bag
4134             is controlled by the constructor parameter C<bag_overlap_fraction>. If this
4135             parameter is set to, say, 0.2, the size of each bag will grow by 20% with the samples
4136             drawn from the other bags.
4137              
4138             =item B<show_training_data_in_bags():>
4139              
4140             Shows for each bag the names of the training data samples in that bag.
4141              
4142             =item B<calculate_first_order_probabilities():>
4143              
4144             Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
4145             first-order probabilities from the data samples in each bag.
4146              
4147             =item B<calculate_class_priors():>
4148              
4149             Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4150             class priors for the data samples in each bag.
4151              
4152             =item B<construct_decision_trees_for_bags():>
4153              
4154             Calls on the appropriate method of the main C<DecisionTree> class to construct a
4155             decision tree from the training data in each bag.
4156              
4157             =item B<display_decision_trees_for_bags():>
4158              
4159             Displays separately the decision tree constructed for each bag.
4160              
4161             =item B<classify_with_bagging( $test_sample ):>
4162              
4163             Calls on the appropriate methods of the main C<DecisionTree> class to classify the
4164             argument test sample.
4165              
4166             =item B<display_classification_results_for_each_bag():>
4167              
4168             Displays separately the classification decision made by the decision tree
4169             constructed for each bag.
4170              
4171             =item B<get_majority_vote_classification():>
4172              
4173             Using majority voting, this method aggregates the classification decisions made by
4174             the individual decision trees into a single decision.
4175              
4176             =back
4177              
4178             See the example scripts in the directory C<ExamplesBagging> for how to call these
4179             methods for classifying individual samples and for bulk classification when you place
4180             all your test samples in a single file.
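
For reference, here is a minimal sketch of a single-sample bagging session that
strings together the calls described above. The training file and the test sample
are illustrative; check C<bagging_for_classifying_one_test_sample.pl> in the
C<ExamplesBagging> directory for the exact calling conventions:

    use Algorithm::DecisionTreeWithBagging;

    my $dtbag = Algorithm::DecisionTreeWithBagging->new(
                    training_datafile        => 'stage3cancer.csv',
                    csv_class_column_index   => 2,
                    csv_columns_for_features => [3,4,5,6,7,8],
                    entropy_threshold        => 0.01,
                    max_depth_desired        => 8,
                    symbolic_to_numeric_cardinality_threshold => 10,
                    how_many_bags            => 4,
                    bag_overlap_fraction     => 0.2,
                    csv_cleanup_needed       => 1,
                );
    $dtbag->get_training_data_for_bagging();
    $dtbag->calculate_first_order_probabilities();
    $dtbag->calculate_class_priors();
    $dtbag->construct_decision_trees_for_bags();

    my @test_sample = qw / g2=4.2 grade=2.3 gleason=4 eet=1.7 age=55.0 ploidy=diploid /;
    $dtbag->classify_with_bagging(\@test_sample);
    $dtbag->display_classification_results_for_each_bag();
    $dtbag->get_majority_vote_classification();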
4181              
4182             =head1 USING BOOSTING
4183              
4184             Starting with Version 3.20, you can use the class C<BoostedDecisionTree> for
4185             constructing a boosted decision-tree classifier. Boosting results in a cascade of
4186             decision trees in which each decision tree is constructed with samples that are
4187             mostly those that are misclassified by the previous decision tree. To be precise,
4188             you create a probability distribution over the training samples for the selection of
4189             samples for training each decision tree in the cascade. To start out, the
4190             distribution is uniform over all of the samples. Subsequently, this probability
4191             distribution changes according to the misclassifications by each tree in the cascade:
4192             if a sample is misclassified by a given tree in the cascade, the probability of its
4193             being selected for training the next tree is increased significantly. You also
4194             associate a trust factor with each decision tree depending on its power to classify
4195             correctly all of the training data samples. After a cascade of decision trees is
4196             constructed in this manner, you construct a final classifier that calculates the
4197             class label for a test data sample by taking into account the classification
4198             decisions made by each individual tree in the cascade, the decisions being weighted
4199             by the trust factors associated with the individual classifiers. These boosting
4200             notions --- generally referred to as the AdaBoost algorithm --- are based on a now
4201             celebrated paper "A Decision-Theoretic Generalization of On-Line Learning and an
4202             Application to Boosting" by Yoav Freund and Robert Schapire that appeared in 1995 in
4203             the Proceedings of the 2nd European Conf. on Computational Learning Theory. For a
4204             tutorial introduction to AdaBoost, see L
4205              
4206             Keep in mind the fact that, ordinarily, the theoretical guarantees provided by
4207             boosting apply only to the case of binary classification. Additionally, your
4208             training dataset must capture all of the significant statistical variations in the
4209             classes represented therein.
4210              
4211             =over 4
4212              
4213             =item B<new():>
4214              
4215             If you'd like to experiment with boosting, a typical call to the constructor for the
4216             C<Algorithm::BoostedDecisionTree> class looks like:
4217              
4218             use Algorithm::BoostedDecisionTree;
4219             my $training_datafile = "training6.csv";
4220             my $boosted = Algorithm::BoostedDecisionTree->new(
4221             training_datafile => $training_datafile,
4222             csv_class_column_index => 1,
4223             csv_columns_for_features => [2,3],
4224             entropy_threshold => 0.01,
4225             max_depth_desired => 8,
4226             symbolic_to_numeric_cardinality_threshold => 10,
4227             how_many_stages => 4,
4228             csv_cleanup_needed => 1,
4229             );
4230              
4231             Note in particular the constructor parameter:
4232            
4233             how_many_stages
4234              
4235             As its name implies, this parameter controls how many stages will be used in the
4236             boosted decision tree classifier. As mentioned above, a separate decision tree is
4237             constructed for each stage of boosting using a set of training samples that are drawn
4238             through a probability distribution maintained over the entire training dataset.
4239              
4240             =back
4241              
4242             =head2 B<Methods defined for C<BoostedDecisionTree> class>
4243              
4244             =over 8
4245              
4246             =item B<get_training_data_for_base_tree():>
4247              
4248             This method reads your training datafile and creates the data structures from the data
4249             ingested for constructing the base decision tree.
4250              
4251             =item B<show_training_data_for_base_tree():>
4252              
4253             Writes to the standard output the training data samples and also some relevant
4254             properties of the features used in the training dataset.
4255              
4256             =item B<calculate_first_order_probabilities_and_class_priors():>
4257              
4258             Calls on the appropriate methods of the main C<DecisionTree> class to estimate the
4259             first-order probabilities and the class priors.
4260              
4261             =item B<construct_base_decision_tree():>
4262              
4263             Calls on the appropriate method of the main C<DecisionTree> class to construct the
4264             base decision tree.
4265              
4266             =item B<display_base_decision_tree():>
4267              
4268             Displays the base decision tree in your terminal window. (The textual form of the
4269             decision tree is written out to the standard output.)
4270              
4271             =item B<construct_cascade_of_trees():>
4272              
4273             Uses the AdaBoost algorithm to construct a cascade of decision trees. As mentioned
4274             earlier, the training samples for each tree in the cascade are drawn using a
4275             probability distribution over the entire training dataset. This probability
4276             distribution for any given tree in the cascade is heavily influenced by which
4277             training samples are misclassified by the previous tree.
4278              
4279             =item B<display_decision_trees_for_different_stages():>
4280              
4281             Displays separately in your terminal window the decision tree constructed for each
4282             stage of the cascade. (The textual form of the trees is written out to the standard
4283             output.)
4284              
4285             =item B<classify_with_boosting( $test_sample ):>
4286              
4287             Calls on each decision tree in the cascade to classify the argument C<$test_sample>.
4288              
4289             =item B<display_classifications_by_each_stage():>
4290              
4291             You can call this method to display in your terminal window the classification
4292             decision made by each decision tree in the cascade. The method also prints out the
4293             trust factor associated with each decision tree. It is important to look
4294             simultaneously at the classification decision and the trust factor for each tree ---
4295             since a classification decision made by a specific tree may appear bizarre for a
4296             given test sample. This method is useful primarily for debugging purposes.
4297              
4298             =item B<show_class_labels_for_misclassified_samples_in_stage( $stage_index ):>
4299              
4300             As with the previous method, this method is useful mostly for debugging. It returns
4301             class labels for the samples misclassified by the stage whose integer index is
4302             supplied as an argument to the method. Say you have 10 stages in your cascade. The
4303             value of the argument C<$stage_index> would go from 0 to 9, with 0 corresponding to
4304             the base tree.
4305              
4306             =item B<trust_weighted_majority_vote_classifier():>
4307              
4308             Uses the "final classifier" formula of the AdaBoost algorithm to pool together the
4309             classification decisions made by the individual trees while taking into account the
4310             trust factors associated with the trees. As mentioned earlier, we associate with
4311             each tree of the cascade a trust factor that depends on the overall misclassification
4312             rate associated with that tree.
4313              
4314             =back
4315              
4316             See the example scripts in the C<ExamplesBoosting> subdirectory for how to call the
4317             methods listed above for classifying individual data samples with boosting and for
4318             bulk classification when you place all your test samples in a single file.
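
For reference, here is a minimal sketch of a single-sample boosting session based on
the method descriptions above. The training file and the feature names in the test
sample are illustrative; check the scripts in the C<ExamplesBoosting> subdirectory
for the exact calling conventions and for how the final classification is retrieved:

    use Algorithm::BoostedDecisionTree;

    my $boosted = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => 'training6.csv',
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2,3],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      symbolic_to_numeric_cardinality_threshold => 10,
                      how_many_stages          => 4,
                      csv_cleanup_needed       => 1,
                  );
    $boosted->get_training_data_for_base_tree();
    $boosted->calculate_first_order_probabilities_and_class_priors();
    $boosted->construct_base_decision_tree();
    $boosted->construct_cascade_of_trees();

    my @test_sample = qw / feature1=3.5 feature2=6.1 /;   # illustrative feature names
    $boosted->classify_with_boosting(\@test_sample);
    $boosted->display_classifications_by_each_stage();
    $boosted->trust_weighted_majority_vote_classifier();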
4319              
4320              
4321             =head1 USING RANDOMIZED DECISION TREES
4322              
4323             As mentioned earlier, the new C<RandomizedTreesForBigData> class allows you to solve
4324             the following two problems: (1) Data classification using the needle-in-a-haystack
4325             metaphor, that is, when a vast majority of your training samples belong to just one
4326             class. And (2) You have access to a very large database of training samples and you
4327             wish to construct an ensemble of decision trees for classification.
4328              
4329             =over 4
4330              
4331             =item B<new():>
4332              
4333             Here is how you'd call the C<RandomizedTreesForBigData> constructor for
4334             needle-in-a-haystack classification:
4335              
4336             use Algorithm::RandomizedTreesForBigData;
4337             my $training_datafile = "your_database.csv";
4338             my $rt = Algorithm::RandomizedTreesForBigData->new(
4339             training_datafile => $training_datafile,
4340             csv_class_column_index => 48,
4341             csv_columns_for_features => [24,32,33,34,41],
4342             entropy_threshold => 0.01,
4343             max_depth_desired => 8,
4344             symbolic_to_numeric_cardinality_threshold => 10,
4345             how_many_trees => 5,
4346             looking_for_needles_in_haystack => 1,
4347             csv_cleanup_needed => 1,
4348             );
4349              
4350             Note in particular the constructor parameters:
4351              
4352             looking_for_needles_in_haystack
4353             how_many_trees
4354              
4355             The first of these parameters, C<looking_for_needles_in_haystack>, invokes the logic for
4356             constructing an ensemble of decision trees, each based on a training dataset that
4357             uses all of the minority class samples, and a random drawing from the majority class
4358             samples.
4359              
4360             Here is how you'd call the C<RandomizedTreesForBigData> constructor for a more
4361             general attempt at constructing an ensemble of decision trees, with each tree trained
4362             with randomly drawn samples from a large database of training data (without paying
4363             attention to the differences in the sizes of the populations for the different
4364             classes):
4365              
4366             use Algorithm::RandomizedTreesForBigData;
4367             my $training_datafile = "your_database.csv";
4368             my $rt = Algorithm::RandomizedTreesForBigData->new(
4369             training_datafile => $training_datafile,
4370             csv_class_column_index => 2,
4371             csv_columns_for_features => [3,4,5,6,7,8],
4372             entropy_threshold => 0.01,
4373             max_depth_desired => 8,
4374             symbolic_to_numeric_cardinality_threshold => 10,
4375             how_many_trees => 3,
4376             how_many_training_samples_per_tree => 50,
4377             csv_cleanup_needed => 1,
4378             );
4379              
4380             Note in particular the constructor parameters:
4381              
4382             how_many_training_samples_per_tree
4383             how_many_trees
4384              
4385             When you set the C<how_many_training_samples_per_tree> parameter, you are not allowed
4386             to also set the C<looking_for_needles_in_haystack> parameter, and vice versa.
4387              
4388             =back
4389              
4390             =head2 B<Methods defined for C<RandomizedTreesForBigData> class>
4391              
4392             =over 8
4393              
4394             =item B<get_training_data_for_N_trees():>
4395              
4396             What this method does depends on which of the two constructor parameters ---
4397             C<looking_for_needles_in_haystack> or C<how_many_training_samples_per_tree> --- is
4398             set. When the former is set, it creates a collection of training datasets for
4399             C<how_many_trees> number of decision trees, with each dataset being a mixture of the
4400             minority class and samples drawn randomly from the majority class. However, when the
4401             latter option is set, all the datasets are drawn randomly from the training database
4402             with no particular attention given to the relative populations of the two classes.
4403              
4404             =item B<show_training_data_for_all_trees():>
4405              
4406             As the name implies, this method shows the training data being used for all the
4407             decision trees. This method is useful for debugging purposes using small datasets.
4408              
4409             =item B<calculate_first_order_probabilities():>
4410              
4411             Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4412             first-order probabilities for the training dataset to be used for each decision tree.
4413              
4414             =item B<calculate_class_priors():>
4415              
4416             Calls on the appropriate method of the main C<DecisionTree> class to estimate the
4417             class priors for the training dataset to be used for each decision tree.
4418              
4419             =item B<construct_all_decision_trees():>
4420              
4421             Calls on the appropriate method of the main C<DecisionTree> class to construct the
4422             decision trees.
4423              
4424             =item B<display_all_decision_trees():>
4425              
4426             Displays all the decision trees in your terminal window. (The textual form of the
4427             decision trees is written out to the standard output.)
4428              
4429             =item B<classify_with_all_trees( $test_sample ):>
4430              
4431             The test_sample is sent to each decision tree for classification.
4432              
4433             =item B<display_classification_results_for_all_trees():>
4434              
4435             The classification decisions returned by the individual decision trees are written
4436             out to the standard output.
4437              
4438             =item B<get_majority_vote_classification():>
4439              
4440             This method aggregates the classification results returned by the individual decision
4441             trees and returns the majority decision.
4442              
4443             =back
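
For reference, here is a minimal sketch that strings together the calls described
above for the needle-in-a-haystack case. The file name, column indexes, and test
sample are illustrative; check the scripts in the C<ExamplesRandomizedTrees>
directory for the exact calling conventions:

    use Algorithm::RandomizedTreesForBigData;

    my $rt = Algorithm::RandomizedTreesForBigData->new(
                 training_datafile        => 'your_database.csv',
                 csv_class_column_index   => 48,
                 csv_columns_for_features => [24,32,33,34,41],
                 entropy_threshold        => 0.01,
                 max_depth_desired        => 8,
                 symbolic_to_numeric_cardinality_threshold => 10,
                 how_many_trees           => 5,
                 looking_for_needles_in_haystack => 1,
                 csv_cleanup_needed       => 1,
             );
    $rt->get_training_data_for_N_trees();
    $rt->calculate_first_order_probabilities();
    $rt->calculate_class_priors();
    $rt->construct_all_decision_trees();

    my @test_sample = qw / feature1=3.5 feature2=6.1 /;   # illustrative feature names
    $rt->classify_with_all_trees(\@test_sample);
    $rt->display_classification_results_for_all_trees();
    $rt->get_majority_vote_classification();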
4444              
4445             =head1 CONSTRUCTING REGRESSION TREES:
4446              
4447             Decision tree based modeling requires that the class labels be distinct. That is,
4448             the training dataset must contain a relatively small number of discrete class labels
4449             for all of your data records if you want to model the data with one or more decision
4450             trees. However, when one is trying to understand all of the associational
4451             relationships that exist in a large database, one often runs into situations where,
4452             instead of discrete class labels, you have a continuously valued variable as a
4453             dependent variable whose values are predicated on a set of feature values. It is for
4454             such situations that you will find useful the new class C<RegressionTree> that is now
4455             a part of the C<Algorithm::DecisionTree> module. The C<RegressionTree> class has been
4456             programmed as a subclass of the main C<DecisionTree> class.
4457              
4458             You can think of regression with a regression tree as a powerful generalization of
4459             the very commonly used Linear Regression algorithms. Although you can certainly
4460             carry out polynomial regression with run-of-the-mill Linear Regression algorithms for
4461             modeling nonlinearities between the predictor variables and the dependent variable,
4462             specifying the degree of the polynomial is often tricky. Additionally, a polynomial
4463             can inject continuities between the predictor and the predicted variables that may
4464             not really exist in the real data. Regression trees, on the other hand, give you a
4465             piecewise linear relationship between the predictor and the predicted variables that
4466             is freed from the constraints of superimposed continuities at the joins between the
4467             different segments. See the following tutorial for further information regarding the
4468             standard linear regression approach and the regression that can be achieved with the
4469             RegressionTree class in this module:
4470             L
4471              
4472             The RegressionTree class in the current version of the module assumes that all of
4473             your data is numerical. That is, unlike what is possible with the DecisionTree class
4474             (and the other more closely related classes in this module) that allow your training
4475             file to contain a mixture of numerical and symbolic data, the RegressionTree class
4476             requires that ALL of your data be numerical. I hope to relax this constraint in
4477             future versions of this module. Obviously, the dependent variable will always be
4478             numerical for regression.
4479              
4480             See the example scripts in the directory C<ExamplesRegression> if you wish to become
4481             more familiar with the regression capabilities of the module.
4482              
4483             =over 4
4484              
4485             =item B<new():>
4486              
4487             my $training_datafile = "gendata5.csv";
4488             my $rt = Algorithm::RegressionTree->new(
4489             training_datafile => $training_datafile,
4490             dependent_variable_column => 2,
4491             predictor_columns => [1],
4492             mse_threshold => 0.01,
4493             max_depth_desired => 2,
4494             jacobian_choice => 0,
4495             csv_cleanup_needed => 1,
4496             );
4497              
4498             Note in particular the constructor parameters:
4499              
4500             dependent_variable_column
4501             predictor_columns
4502             mse_threshold
4503             jacobian_choice
4504              
4505             The first of these parameters, C<dependent_variable_column>, is set to the column index
4506             in the CSV file for the dependent variable. The second constructor parameter,
4507             C<predictor_columns>, tells the system which columns contain values for the
4508             predictor variables. The third parameter, C<mse_threshold>, is for deciding when to
4509             partition the data at a node into two child nodes as a regression tree is being
4510             constructed. If the minmax of MSE (Mean Squared Error) that can be achieved by
4511             partitioning any of the features at a node is smaller than C<mse_threshold>, that
4512             node becomes a leaf node of the regression tree.
4513              
4514             The last parameter, C<jacobian_choice>, must be set to either 0 or 1 or 2. Its
4515             default value is 0. When this parameter equals 0, the regression coefficients are
4516             calculated using the linear least-squares method and no further "refinement" of the
4517             coefficients is carried out using gradient descent. This is the fastest way to
4518             calculate the regression coefficients. When C<jacobian_choice> is set to 1, you get
4519             a weak version of gradient descent in which the Jacobian is set to the "design
4520             matrix" itself. Choosing 2 for C<jacobian_choice> results in a more reasonable
4521             approximation to the Jacobian. That, however, is at a cost of much longer
4522             computation time. B<Note:> For most cases, using 0 for C<jacobian_choice> is the
4523             best choice. See my tutorial on regression for why
4524             that is the case.
4525              
4526             =back
4527              
4528             =head2 B<Methods defined for C<RegressionTree> class>
4529              
4530             =over 8
4531              
4532             =item B<get_training_data_for_regression():>
4533              
4534             Only CSV training datafiles are allowed. Additionally, the first record in the file
4535             must list the names of the fields, and the first column must contain an integer ID
4536             for each record.
4537              
4538             =item B<construct_regression_tree():>
4539              
4540             As the name implies, this is the method that constructs a regression tree.
4541              
4542             =item B<display_regression_tree("   "):>
4543              
4544             Displays the regression tree, as the name implies. The white-space string argument
4545             specifies the offset to use in displaying the child nodes in relation to a parent
4546             node.
4547              
4548             =item B<prediction_for_single_data_point( $root_node, $test_sample ):>
4549              
4550             You call this method after you have constructed a regression tree if you want to
4551             calculate the prediction for one sample. The parameter C<$root_node> is what is
4552             returned by the call to C<construct_regression_tree()>. The formatting of the argument
4553             bound to the C<$test_sample> parameter is important. To elaborate, let's say you are
4554             using two variables named C<$xvar1> and C<$xvar2> as your predictor variables. In
4555             this case, the C<$test_sample> parameter will be bound to a list that will look like
4556              
4557             ['xvar1 = 23.4', 'xvar2 = 12.9']
4558              
4559             Arbitrary amount of white space, including none, on the two sides of the equality
4560             symbol is allowed in the construct shown above. A call to this method returns a
4561             hash with two key-value pairs. One of the keys is called C<solution_path> and
4562             the other C<prediction>. The value associated with the key C<solution_path> is the path
4563             in the regression tree to the leaf node that yielded the prediction. And the value
4564             associated with the key C<prediction> is the answer you are looking for.
4565              
4566             =item B<predictions_for_all_data_used_for_regression_estimation( $root_node ):>
4567              
4568             This call calculates the predictions for all of the predictor variables data in your
4569             training file. The parameter C<$root_node> is what is returned by the call to
4570             C<construct_regression_tree()>. The values for the dependent variable thus predicted
4571             can be seen by calling C<display_all_plots()>, which is the method mentioned below.
4572              
4573             =item B<display_all_plots():>
4574              
4575             This method displays the results obtained by calling the prediction method of the
4576             previous entry. This method also creates a hardcopy of the plots and saves it as a
4577             C<.png> disk file. The name of this output file is always C.
4578              
4579             =item B<mse_for_tree_regression_for_all_training_samples( $root_node ):>
4580              
4581             This method carries out an error analysis of the predictions for the samples in your
4582             training datafile. It shows you the overall MSE (Mean Squared Error) with tree-based
4583             regression, the MSE for the data samples at each of the leaf nodes of the regression
4584             tree, and the MSE for the plain old Linear Regression as applied to all of the data.
4585             The parameter C<$root_node> in the call syntax is what is returned by the call to
4586             C<construct_regression_tree()>.
4587              
4588             =item B<bulk_predictions_for_data_in_a_csv_file( $root_node, $filename, $columns ):>
4589              
4590             Call this method if you want to apply the regression tree to all your test data in a
4591             disk file. The predictions for all of the test samples in the disk file are written
4592             out to another file whose name is the same as that of the test file except for the
4593             addition of C<_output> in the name of the file. The parameter C<$filename> is the
4594             name of the disk file that contains the test data. And the parameter C<$columns> is a
4595             list of the column indices for the predictor variables in the test file.
4596              
4597             =back
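
For reference, here is a minimal sketch of a regression-tree session that strings
together the calls described above. It assumes a hash reference is returned by
C<prediction_for_single_data_point()> with the C<prediction> and C<solution_path>
keys described earlier, and it uses the illustrative predictor name C<xvar1>; check
the scripts in the C<ExamplesRegression> directory for the exact calling conventions:

    use Algorithm::RegressionTree;

    my $rt = Algorithm::RegressionTree->new(
                 training_datafile         => 'gendata5.csv',
                 dependent_variable_column => 2,
                 predictor_columns         => [1],
                 mse_threshold             => 0.01,
                 max_depth_desired         => 2,
                 jacobian_choice           => 0,
                 csv_cleanup_needed        => 1,
             );
    $rt->get_training_data_for_regression();
    my $root_node = $rt->construct_regression_tree();
    $rt->display_regression_tree("   ");

    # Prediction for a single data point:
    my $answer = $rt->prediction_for_single_data_point($root_node, ['xvar1 = 23.4']);
    print "Prediction: $answer->{prediction}\n";

    # Error analysis over the training data:
    $rt->mse_for_tree_regression_for_all_training_samples($root_node);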
4598              
4599             =head1 GENERATING SYNTHETIC TRAINING DATA
4600              
4601             The module file contains the following additional classes: (1)
4602             C<TrainingDataGeneratorNumeric>, and (2) C<TrainingDataGeneratorSymbolic> for
4603             generating synthetic training data.
4604              
4605             The class C<TrainingDataGeneratorNumeric> outputs one CSV file for the
4606             training data and another one for the test data for experimenting with numeric
4607             features. The numeric values are generated using a multivariate Gaussian
4608             distribution whose mean and covariance are specified in a parameter file. See the
4609             file C<param_numeric.txt> in the C<Examples> directory for an example of such a
4610             parameter file. Note that the dimensionality of the data is inferred from the
4611             information you place in the parameter file.
4612              
4613             The class C<TrainingDataGeneratorSymbolic> generates synthetic training data for the
4614             purely symbolic case. The relative frequencies of the different possible values for
4615             the features is controlled by the biasing information you place in a parameter file.
4616             See C<param_symbolic.txt> for an example of such a file.
4617              
4618              
4619             =head1 THE C<Examples> DIRECTORY
4620              
4621             See the C<Examples> directory in the distribution for how to construct a decision
4622             tree, and how to then classify new data using the decision tree. To become more
4623             familiar with the module, run the scripts
4624              
4625             construct_dt_and_classify_one_sample_case1.pl
4626             construct_dt_and_classify_one_sample_case2.pl
4627             construct_dt_and_classify_one_sample_case3.pl
4628             construct_dt_and_classify_one_sample_case4.pl
4629              
4630             The first script is for the purely symbolic case, the second for the case that
4631             involves both numeric and symbolic features, the third for the case of purely numeric
4632             features, and the last for the case when the training data is synthetically generated
4633             by the script C<generate_training_data_numeric.pl>.
4634              
4635             Next run the following script as it is for bulk classification of data records placed
4636             in a CSV file:
4637              
4638             classify_test_data_in_a_file.pl training4.csv test4.csv out4.csv
4639              
4640             The script first constructs a decision tree using the training data in the training
4641             file supplied by the first argument file C<training4.csv>. The script then
4642             calculates the class label for each data record in the test data file supplied
4643             through the second argument file, C<test4.csv>. The estimated class labels are
4644             written out to the output file which in the call shown above is C<out4.csv>. An
4645             important thing to note here is that your test file --- in this case C<test4.csv> ---
4646             must have a column for class labels. Obviously, in real-life situations, there will
4647             be no class labels in this column. When that is the case, you can place an empty
4648             string C<""> there for each data record. This is demonstrated by the following call:
4649              
4650             classify_test_data_in_a_file.pl training4.csv test4_no_class_labels.csv out4.csv
4651              
4652             The following script in the C<Examples> directory
4653              
4654             classify_by_asking_questions.pl
4655              
4656             shows how you can use a decision-tree classifier interactively. In this mode, you
4657             first construct the decision tree from the training data and then the user is
4658             prompted for answers to the feature tests at the nodes of the tree.
4659              
4660             If your training data has a feature whose values span a large range and, at the same
4661             time, are characterized by a heavy-tail distribution, you should look at the script
4662              
4663             construct_dt_for_heavytailed.pl
4664              
4665             to see how to use the option C<number_of_histogram_bins> in the call to the
4666             constructor. This option was introduced in Version 2.22 for dealing with such
4667             features. If you do not set this option, the module will use the default value of
4668             500 for the number of points at which to sample the value range for such a feature.
4669              
4670             The C<Examples> directory also contains the following scripts:
4671              
4672             generate_training_data_numeric.pl
4673             generate_training_data_symbolic.pl
4674              
4675             that show how you can use the module to generate synthetic training data. Synthetic
4676             training data is generated according to the specifications laid out in a parameter file.
4677             There are constraints on how the information is laid out in a parameter file. See
4678             the files C<param_numeric.txt> and C<param_symbolic.txt> in the C<Examples> directory
4679             for how to structure these files.
4680              
4681             The C<Examples> directory of Versions 2.1 and higher of the module also contains the
4682             following two scripts:
4683              
4684             evaluate_training_data1.pl
4685             evaluate_training_data2.pl
4686              
4687             that illustrate how the Perl class C<EvalTrainingData> can be used to evaluate the
4688             quality of your training data (as long as it resides in a `C<.csv>' file.) This new
4689             class is a subclass of the C<DecisionTree> class in the module file. See the README
4690             in the C<Examples> directory for further information regarding these two scripts.
4691              
4692             The C<Examples> directory of Versions 2.31 and higher of the module contains the
4693             following three scripts:
4694              
4695             introspection_in_a_loop_interactive.pl
4696              
4697             introspection_show_training_samples_at_all_nodes_direct_influence.pl
4698              
4699             introspection_show_training_samples_to_nodes_influence_propagation.pl
4700              
4701             The first script illustrates how to use the C<DTIntrospection> class of the module
4702             interactively for generating explanations for the classification decisions made at
4703             the nodes of the decision tree. In the interactive session you are first asked for
4704             the node number you are interested in. Subsequently, you are asked for whether or
4705             not you are interested in specific questions that the introspector can provide
4706             answers for. The second script generates a tabular display that shows for each node
4707             of the decision tree a list of the training samples that fall directly in the portion
4708             of the feature space assigned to that node. (As mentioned elsewhere in this
4709             documentation, when this list is empty for a node, that means the node is a result of
4710             the generalization achieved by probabilistic modeling of the data. Note that this
4711             module constructs a decision tree NOT by partitioning the set of training samples,
4712             BUT by partitioning the domains of the probability density functions.) The third
4713             script listed above also generates a tabular display, but one that shows how the
4714             influence of each training sample propagates in the tree. This display first shows
4715             the list of nodes that are affected directly by the data in a training sample. This
4716             list is followed by an indented display of the nodes that are affected indirectly by
4717             the training sample. A training sample affects a node indirectly if the node is a
4718             descendant of one of the nodes affected directly.
4719              
4720             The latest addition to the Examples directory is the script:
4721              
4722             get_indexes_associated_with_fields.py
4723              
4724             As to why you may find this script useful, note that large database files may have
4725             hundreds of fields and it is not always easy to figure out what numerical index is
4726             associated with a given field. At the same time, the constructor of the DecisionTree
4727             module requires that the field that holds the class label and the fields that contain
4728             the feature values be specified by their numerical zero-based indexes. If you have a
4729             large database and you are faced with this problem, you can run this script to see
4730             the zero-based numerical index values associated with the different columns of your
4731             CSV file.
4732              
4733              
4734             =head1 THE C<ExamplesBagging> DIRECTORY
4735              
4736             The C<ExamplesBagging> directory contains the following scripts:
4737              
4738             bagging_for_classifying_one_test_sample.pl
4739            
4740             bagging_for_bulk_classification.pl
4741              
4742             As the names of the scripts imply, the first shows how to call the different methods
4743             of the C<DecisionTreeWithBagging> class for classifying a single test sample. When
4744             you are classifying a single test sample, you can also see how each bag is
4745             classifying the test sample. You can, for example, display the training data used in
4746             each bag, the decision tree constructed for each bag, etc.
4747              
4748             The second script is for the case when you place all of the test samples in a single
4749             file. The demonstration script displays for each test sample a single aggregate
4750             classification decision that is obtained through majority voting by all the decision
4751             trees.
4752              
4753              
4754             =head1 THE C<ExamplesBoosting> DIRECTORY
4755              
4756             The C<ExamplesBoosting> subdirectory in the main installation directory contains the
4757             following three scripts:
4758              
4759             boosting_for_classifying_one_test_sample_1.pl
4760              
4761             boosting_for_classifying_one_test_sample_2.pl
4762              
4763             boosting_for_bulk_classification.pl
4764              
4765             As the names of the first two scripts imply, these show how to call the different
4766             methods of the C<Algorithm::BoostedDecisionTree> class for classifying a single test sample.
4767             When you are classifying a single test sample, you can see how each stage of the
4768             cascade of decision trees is classifying the test sample. You can also view each
4769             decision tree separately and also see the trust factor associated with the tree.
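
As a rough illustration of that calling sequence, here is a minimal sketch for a single
test sample. The method names follow the demo scripts and are assumptions rather than a
definitive API reference; the training file, column indexes, and test sample are
placeholders:

    use Algorithm::BoostedDecisionTree;

    my $booster = Algorithm::BoostedDecisionTree->new(
                      training_datafile        => 'training.csv',    # placeholder
                      csv_class_column_index   => 1,
                      csv_columns_for_features => [2, 3],
                      entropy_threshold        => 0.01,
                      max_depth_desired        => 8,
                      how_many_stages          => 4,
                      csv_cleanup_needed       => 1,
                  );
    $booster->get_training_data_for_base_tree();
    $booster->calculate_first_order_probabilities_and_class_priors();
    $booster->construct_base_decision_tree();
    $booster->construct_cascade_of_trees();
    $booster->display_decision_trees_for_different_stages_of_cascade();   # optional

    my @test_sample = qw/feature1=val1 feature2=val2/;    # placeholder sample
    $booster->classify_with_boosting(\@test_sample);
    $booster->display_classification_results_for_all_trees();
    $booster->trust_weighted_majority_vote_classifier();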
4770              
4771             The third script is for the case when you place all of the test samples in a single
4772             file. The demonstration script outputs for each test sample a single aggregate
4773             classification decision that is obtained through trust-factor weighted majority
4774             voting by all the decision trees.
4775              
4776             =head1 THE C DIRECTORY
4777              
4778             The C<ExamplesRandomizedTrees> directory shows example scripts that you can use to
4779             become more familiar with the C<Algorithm::RandomizedTreesForBigData> class for solving
4780             needle-in-a-haystack and big-data classification problems. These scripts are:
4781              
4782             randomized_trees_for_classifying_one_test_sample_1.pl
4783              
4784             randomized_trees_for_classifying_one_test_sample_2.pl
4785              
4786             classify_database_records.pl
4787              
4788             The first script shows the constructor options to use for solving a
4789             needle-in-a-haystack problem --- that is, a problem in which a vast majority of the
4790             training data belongs to just one class. The second script shows the constructor
4791             options for using randomized decision trees for the case when you have access to a
4792             very large database of training samples and you'd like to construct an ensemble of
4793             decision trees using training samples pulled randomly from the training database.
4794             The last script illustrates how you can evaluate the classification power of an
4795             ensemble of decision trees as constructed by C<Algorithm::RandomizedTreesForBigData> by classifying
4796             a large number of test samples extracted randomly from the training database.
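
For concreteness, here is a sketch of what the calls might look like for the
needle-in-a-haystack case. The constructor option and method names are taken from the
demo scripts and should be treated as assumptions; the file name, column indexes, and
test sample are placeholders:

    use Algorithm::RandomizedTreesForBigData;

    my $rt = Algorithm::RandomizedTreesForBigData->new(
                 training_datafile               => 'my_large_database.csv',   # placeholder
                 csv_class_column_index          => 48,                        # placeholder
                 csv_columns_for_features        => [24, 32, 33, 34, 41],      # placeholder
                 entropy_threshold               => 0.01,
                 max_depth_desired               => 8,
                 how_many_trees                  => 20,
                 looking_for_needles_in_haystack => 1,   # spread the rare class over all trees
                 csv_cleanup_needed              => 1,
             );
    $rt->get_training_data_for_N_trees();
    $rt->calculate_first_order_probabilities();
    $rt->calculate_class_priors();
    $rt->construct_all_decision_trees();

    my @test_sample = qw/feature1=val1 feature2=val2/;    # placeholder sample
    $rt->classify_with_all_trees(\@test_sample);
    $rt->display_classification_results_for_all_trees();
    $rt->get_majority_vote_classification();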
4797              
4798              
4799             =head1 THE C<ExamplesRegression> DIRECTORY
4800              
4801             The C<ExamplesRegression> subdirectory in the main installation directory shows
4802             example scripts that you can use to become familiar with regression trees and how
4803             they can be used for nonlinear regression. If you are new to the concept of
4804             regression trees, start by executing the following scripts without changing them and
4805             see what sort of output is produced by them:
4806              
4807             regression4.pl
4808              
4809             regression5.pl
4810              
4811             regression6.pl
4812              
4813             regression8.pl
4814              
4815             The C<regression4.pl> script involves only one predictor variable and one dependent
4816             variable. The training data for this exercise is drawn from a CSV datafile supplied
4817             in the same directory; that data is strongly nonlinear. When you run the script
4818             C<regression4.pl>, you will see how much better the result from tree regression is
4819             compared to what you can get with linear regression.
4820              
4821             The C<regression5.pl> script is essentially the same as the previous script except
4822             that its training datafile consists of three noisy segments, as opposed to just two
4823             in the previous case.
4824              
4825             The script C<regression6.pl> deals with the case when we have two predictor variables
4826             and one dependent variable. You can think of the data as consisting of noisy height
4827             values over an C<(x1,x2)> plane. The data used in this script is drawn from a CSV
4828             file supplied in the same directory.
4829              
4830             Finally, the script C<regression8.pl> shows how you can carry out bulk prediction for
4831             all your test data records in a disk file. The script writes all the calculated
4832             predictions into another disk file whose name is derived from the name of the test
4833             data file.
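
If you would like to try tree regression on your own data after running these scripts,
here is a minimal sketch for the one-predictor case. The C<Algorithm::RegressionTree>
parameter and method names follow the regression demo scripts and should be treated as
assumptions; the file name, column indexes, and test point are placeholders:

    use Algorithm::RegressionTree;

    my $reg_tree = Algorithm::RegressionTree->new(
                       training_datafile         => 'mydata.csv',   # placeholder
                       dependent_variable_column => 2,              # placeholder
                       predictor_columns         => [1],            # placeholder
                       mse_threshold             => 0.01,
                       max_depth_desired         => 2,
                       csv_cleanup_needed        => 1,
                   );
    $reg_tree->get_training_data_for_regression();
    my $root_node = $reg_tree->construct_regression_tree();
    $reg_tree->display_regression_tree("     ");

    # Predict the dependent variable for one value of the predictor; the
    # "name = value" form for the test point follows the demo scripts.
    my @test_point = ('x = 30.5');
    $reg_tree->prediction_for_single_data_point($root_node, \@test_point);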
4834              
4835              
4836             =head1 EXPORT
4837              
4838             None by design.
4839              
4840             =head1 BUGS
4841              
4842             Please notify the author if you encounter any bugs. When sending email, please place
4843             the string 'DecisionTree' in the subject line.
4844              
4845             =head1 INSTALLATION
4846              
4847             Download the archive from CPAN in any directory of your choice. Unpack the archive
4848             with a command that on a Linux machine would look like:
4849              
4850             tar zxvf Algorithm-DecisionTree-3.42.tar.gz
4851              
4852             This will create an installation directory for you whose name will be
4853             C<Algorithm-DecisionTree-3.42>. Enter this directory and execute the following
4854             commands for a standard install of the module if you have root privileges:
4855              
4856             perl Makefile.PL
4857             make
4858             make test
4859             sudo make install
4860              
4861             If you do not have root privileges, you can carry out a non-standard install of the
4862             module in any directory of your choice by:
4863              
4864             perl Makefile.PL prefix=/some/other/directory/
4865             make
4866             make test
4867             make install
4868              
4869             With a non-standard install, you may also have to set your PERL5LIB environment
4870             variable so that this module can find the required other modules. How you do that
4871             would depend on what platform you are working on. In order to install this module on
4872             a Linux machine on which I use tcsh for the shell, I set the PERL5LIB environment
4873             variable by
4874              
4875             setenv PERL5LIB /some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4876              
4877             If I used bash, I'd need to declare:
4878              
4879             export PERL5LIB=/some/other/directory/lib64/perl5/:/some/other/directory/share/perl5/
4880              
4881              
4882             =head1 THANKS
4883              
4884             I wish to thank many users of this module for their feedback. Many of the
4885             improvements I have made to the module over the years are a result of the feedback
4886             received.
4887              
4888             I thank Slaven Rezic for pointing out that the module worked with Perl 5.14.x. For
4889             Version 2.22, I had set the required version of Perl to 5.18.0 since that's what I
4890             used for testing the module. Slaven's feedback in the form of the Bug report
4891             C<#96547> resulted in Version 2.23 of the module. Version 2.25 further downshifts
4892             the required version of Perl to 5.10.
4893              
4894             On the basis of the report posted by Slaven at C<rt.cpan.org> regarding Version 2.27,
4895             I am removing the Perl version restriction altogether from Version 2.30. Thanks
4896             Slaven!
4897              
4898              
4899             =head1 AUTHOR
4900              
4901             The author, Avinash Kak, recently finished a 17-year long "Objects Trilogy Project"
4902             with the publication of the book I<Designing with Objects> by John Wiley. If
4903             interested, visit his web page at Purdue to find out what this project was all
4904             about. You might like I<Designing with Objects> especially if you enjoyed reading
4905             Harry Potter as a kid (or even as an adult, for that matter).
4906              
4907             If you send email regarding this module, please place the string "DecisionTree" in
4908             your subject line to get past my spam filter. Avi Kak's email address is
4909             C<kak@purdue.edu>
4910              
4911             =head1 COPYRIGHT
4912              
4913             This library is free software; you can redistribute it and/or modify it under the
4914             same terms as Perl itself.
4915              
4916             Copyright 2016 Avinash Kak
4917              
4918             =cut
4919