File Coverage

blib/lib/Algorithm/AM.pm

Criterion	Covered	Total	%
statement	141	143	98.6
branch	28	30	93.3
condition			n/a
subroutine	18	18	100.0
pod	2	3	66.6
total	189	194	97.4

line	stmt	bran	sub	pod	time	code
1						package Algorithm::AM;
2	10		10		284674	use strict;
	10				62
	10				279
3	10		10		47	use warnings;
	10				17
	10				378
4						our $VERSION = '3.12';
5						# ABSTRACT: Classify data with Analogical Modeling
6	10		10		53	use feature 'state';
	10				18
	10				1645
7	10		10		63	use Carp;
	10				15
	10				1523
8						our @CARP_NOT = qw(Algorithm::AM);
9
10						# Place this accessor here so that Class::Tiny doesn't generate
11						# a getter/setter pair.
12						sub training_set {
13	389		389	1	877	my ($self) = @_;
14	389				692	return $self->{training_set};
15						}
16
17	10				69	use Class::Tiny qw(
18						exclude_nulls
19						exclude_given
20						linear
21						training_set
22						), {
23						exclude_nulls => 1,
24						exclude_given => 1,
25						linear => 0,
26	10		10		3396	};
	10				15403
27
28						sub BUILD {
29	195		195	0	21425	my ($self, $args) = @_;
30
31						# check for invalid arguments
32	195				425	my $class = ref $self;
33	195				719	my %valid_attrs = map {$_ => 1}
	780				5887
34						Class::Tiny->get_all_attributes_for($class);
35	195				1031	my @invalids = grep {!$valid_attrs{$_}} sort keys %$args;
	744				1248
36	195	100			573	if(@invalids){
37	1				12	croak "Invalid attributes for $class: " . join ' ',
38						sort @invalids;
39						}
40
41	194	100			531	if(!exists $args->{training_set}){
42	1				18	croak "Missing required parameter 'training_set'";
43						}
44
45	193	100			679	if('Algorithm::AM::DataSet' ne ref $args->{training_set}){
46	1				10	croak 'Parameter training_set should ' .
47						'be an Algorithm::AM::DataSet';
48						}
49	192				594	$self->_initialize();
50						# delete $args->{training_set};
51	192				866	return;
52						}
53
54	10		10		12317	use Algorithm::AM::Result;
	10				29
	10				430
55	10		10		91	use Algorithm::AM::BigInt 'bigcmp';
	10				21
	10				457
56	10		10		4549	use Algorithm::AM::DataSet;
	10				29
	10				609
57	10		10		5471	use Import::Into;
	10				26877
	10				1056
58						# Use Import::Into to export classes into caller
59						sub import {
60	15		15		165948	my $target = caller;
61	15				151	Algorithm::AM::BigInt->import::into($target, 'bigcmp');
62	15				4277	Algorithm::AM::DataSet->import::into($target, 'dataset_from_file');
63	15				3246	Algorithm::AM::DataSet::Item->import::into($target, 'new_item');
64	15				2931	return;
65						}
66
67						require XSLoader;
68						XSLoader::load(__PACKAGE__, $VERSION);
69
70	10		10		3663	use Log::Any qw($log);
	10				58328
	10				63
71
72						# do all of the classification data structure initialization here,
73						# as well as calling the XS initialization method.
74						sub _initialize {
75	192		192		305	my ($self) = @_;
76
77	192				516	my $train = $self->training_set;
78
79						# compute sub-lattices sizes here so that lattice space can be
80						# allocated in the _xs_initialize method. If certain features are
81						# thrown out later, each sub-lattice can only get smaller, so
82						# this is safe to do once here.
83	192				533	my $lattice_sizes = _compute_lattice_sizes($train->cardinality);
84
85						# sum is intitialized to a list of zeros
86	192				499	@{$self->{sum}} = (0.0) x ($train->num_classes + 1);
	192				635
87
88						# preemptively allocate memory
89						# TODO: not sure what this does
90	192				528	@{$self->{itemcontextchain}} = (0) x $train->size;
	192				1616
91
92	192				1637	$self->{$_} = {} for (
93						qw(
94						itemcontextchainhead
95						context_to_class
96						contextsize
97						pointers
98						gang
99						)
100						);
101
102						# Initialize XS data structures
103						# TODO: Perl crashes unless this is saved. The XS
104						# must not be increasing the reference count
105	192				605	$self->{save_this} = $train->_data_classes;
106						$self->_xs_initialize(
107						$lattice_sizes,
108						$self->{save_this},
109						$self->{itemcontextchain},
110						$self->{itemcontextchainhead},
111						$self->{context_to_class},
112						$self->{contextsize},
113						$self->{pointers},
114						$self->{gang},
115						$self->{sum}
116	192				1923	);
117	192				459	return;
118						}
119
120						sub classify {
121	195		195	1	11254	my ($self, $test_item) = @_;
122
123	195				481	my $training_set = $self->training_set;
124	195	100			550	if($training_set->cardinality != $test_item->cardinality){
125	1				4	croak 'Training set and test item do not have the same ' .
126						'cardinality (' . $training_set->cardinality . ' and ' .
127						$test_item->cardinality . ')';
128						}
129
130						# num_feats is the number of features to be used in classification;
131						# if we exclude nulls, then we need to minus the number of '='
132						# found in this test item; otherwise, it's just the number of
133						# columns in a single item vector
134	194				444	my $num_feats = $training_set->cardinality;
135
136	194	100			4997	if($self->exclude_nulls){
137	1808				2524	$num_feats -= grep {$_ eq ''} @{
138	189				1128	$test_item->features };
	189				474
139						}
140
141						# recalculate the lattice sizes with new number of active features
142	194				580	my $lattice_sizes = _compute_lattice_sizes($num_feats);
143						## $activeContexts = 1 << $activeVar;
144
145	194				337	my $nullcontext = pack "b64", '0' x 64;
146
147	194				276	my $given_excluded = 0;
148	194				380	my $test_in_training = 0;
149
150						# initialize classification-related variables
151						# it is important to dereference rather than just
152						# assigning a new one with [] or {}. This is because
153						# the XS code has access to the existing reference,
154						# but will be accessing the wrong variable if we
155						# change it.
156	194				303	%{$self->{contextsize}} = ();
	194				438
157	194				278	%{$self->{itemcontextchainhead}} = ();
	194				350
158	194				327	%{$self->{context_to_class}} = ();
	194				352
159	194				318	%{$self->{pointers}} = ();
	194				347
160	194				325	%{$self->{gang}} = ();
	194				317
161	194				311	@{$self->{itemcontextchain}} = ();
	194				764
162						# big ints are used in AM.xs; these consist of an
163						# array of 8 unsigned longs
164	194				270	foreach (@{$self->{sum}}) {
	194				431
165	753				1178	$_ = pack "L!8", 0, 0, 0, 0, 0, 0, 0, 0;
166						}
167
168						# calculate context labels and associated structures for
169						# the entire data set
170	194				591	for my $index ( 0 .. $training_set->size - 1 ) {
171						my $context = _context_label(
172						# Note: this must be copied to prevent infinite loop;
173						# see todo note for _context_label
174	30034				30423	[@{$lattice_sizes}],
	30034				56815
175						$training_set->get_item($index)->features,
176						$test_item->features,
177						$self->exclude_nulls
178						);
179	30034				79501	$self->{contextsize}->{$context}++;
180						# TODO: explain itemcontextchain and itemcontextchainhead
181						$self->{itemcontextchain}->[$index] =
182	30034				41125	$self->{itemcontextchainhead}->{$context};
183	30034				37849	$self->{itemcontextchainhead}->{$context} = $index;
184
185						# store the class for the subcontext; if there
186						# is already a different class for this subcontext,
187						# then store 0, signifying heterogeneity.
188	30034				55783	my $class = $training_set->_index_for_class(
189						$training_set->get_item($index)->class);
190	30034	100			52173	if ( defined $self->{context_to_class}->{$context} ) {
191	21669	100			38003	if($self->{context_to_class}->{$context} != $class){
192	9834				15328	$self->{context_to_class}->{$context} = 0;
193						}
194						}
195						else {
196	8365				15445	$self->{context_to_class}->{$context} = $class;
197						}
198						}
199						# $nullcontext is all 0's, which is a context label for
200						# a training item that exactly matches the test item. Exclude
201						# the item if required, and set a flag that the test item was
202						# found in the training set.
203	194	100			694	if ( exists $self->{context_to_class}->{$nullcontext} ) {
204	182				300	$test_in_training = 1;
205	182	100			2858	if($self->exclude_given){
206	177				1425	delete $self->{context_to_class}->{$nullcontext};
207	177				343	$given_excluded = 1;
208						}
209						}
210						# initialize the results object to hold all of the configuration
211						# info.
212	194	100			2637	my $result = Algorithm::AM::Result->new(
213						given_excluded => $given_excluded,
214						cardinality => $num_feats,
215						exclude_nulls => $self->exclude_nulls,
216						count_method => $self->linear ? 'linear' : 'squared',
217						training_set => $training_set,
218						test_item => $test_item,
219						test_in_train => $test_in_training,
220						);
221
222	194	50			16384	$log->debug(${$result->config_info})
	0				0
223						if($log->is_debug);
224
225	194				17376	$result->start_time([ (localtime)[0..2] ]);
226	194	100			3854	$self->_fillandcount(
227						$lattice_sizes, $self->linear ? 1 : 0);
228	194				46897	$result->end_time([ (localtime)[0..2] ]);
229
230	194	100			1531	unless ($self->{pointers}->{'grandtotal'}) {
231						#TODO: is this tested yet?
232	1	50			10	if($log->is_warn){
233	0				0	$log->warn('No training items considered. ' .
234						'No prediction possible.');
235						}
236	1				11	return;
237						}
238
239						$result->_process_stats(
240						# TODO: after refactoring to a "guts" object,
241						# just pass that in
242						$self->{sum},
243						$self->{pointers},
244						$self->{itemcontextchainhead},
245						$self->{itemcontextchain},
246						$self->{context_to_class},
247						$self->{gang},
248						$lattice_sizes,
249						$self->{contextsize}
250	193				1323	);
251	193				630	return $result;
252						}
253
254						# since we split the lattice in four, we have to decide which features
255						# go where. Given the number of features being used, return an arrayref
256						# containing the number of features to be used in each of the the four
257						# lattices.
258						sub _compute_lattice_sizes {
259	386		386		626	my ($num_feats) = @_;
260
261	10		10		31140	use integer;
	10				142
	10				62
262	386				496	my @lattice_sizes;
263	386				627	my $half = $num_feats / 2;
264	386				774	$lattice_sizes[0] = $half / 2;
265	386				585	$lattice_sizes[1] = $half - $lattice_sizes[0];
266	386				528	$half = $num_feats - $half;
267	386				569	$lattice_sizes[2] = $half / 2;
268	386				622	$lattice_sizes[3] = $half - $lattice_sizes[2];
269	386				690	return \@lattice_sizes;
270						}
271
272						# Create binary context labels for a training item
273						# by comparing it with a test item. Each training item
274						# needs one binary label for each sublattice (of which
275						# there are currently four), but this is packed into a
276						# single scalar representing an array of 4 shorts (this
277						# format is used in the XS side).
278
279						# TODO: we have to copy lattice_sizes out of $self in order to
280						# iterate it. Otherwise it goes on forever. Why?
281						sub _context_label {
282						# inputs:
283						# number of active features in each lattice,
284						# training item features, test item features,
285						# and boolean indicating if nulls should be excluded
286	30034		30034		119571	my ($lattice_sizes, $train_feats, $test_feats, $skip_nulls) = @_;
287
288						# feature index
289	30034				31683	my $index = 0;
290						# the binary context labels for each separate lattice
291	30034				32506	my @context_list = ();
292
293	30034				37172	for my $a (@$lattice_sizes) {
294						# binary context label for a single sublattice
295	120136				112519	my $context = 0;
296						# loop through all features in the sublattice
297						# assign 0 if features match, 1 if they do not
298	120136				151928	for ( ; $a ; --$a ) {
299
300						# skip null features if indicated
301	247274	100			292639	if($skip_nulls){
302	247187				338202	++$index while $test_feats->[$index] eq '';
303						}
304						# add a 1 for mismatched variable, 0 for matched variable
305	247274				273573	$context = ( $context << 1 ) \| (
306						$test_feats->[$index] ne $train_feats->[$index] );
307	247274				310486	++$index;
308						}
309	120136				145713	push @context_list, $context;
310						}
311						# a context label is an array of unsigned shorts in XS
312	30034				52929	my $context = pack "S!4", @context_list;
313	30034				51064	return $context;
314						}
315
316						1;
317
318						__END__