File Coverage

blib/lib/Algorithm/AM.pm

Criterion	Covered	Total	%
statement	141	143	98.6
branch	28	30	93.3
condition			n/a
subroutine	18	18	100.0
pod	2	3	66.6
total	189	194	97.4

line	stmt	bran	sub	pod	time	code
1						package Algorithm::AM;
2	10		10		75665	use strict;
	10				12
	10				207
3	10		10		24	use warnings;
	10				10
	10				292
4						our $VERSION = '3.10';
5						# ABSTRACT: Classify data with Analogical Modeling
6	10		10		28	use feature 'state';
	10				10
	10				606
7	10		10		33	use Carp;
	10				20
	10				967
8						our @CARP_NOT = qw(Algorithm::AM);
9
10						# Place this accessor here so that Class::Tiny doesn't generate
11						# a getter/setter pair.
12						sub training_set {
13	389		389	1	297	my ($self) = @_;
14	389				468	return $self->{training_set};
15						}
16
17	10				52	use Class::Tiny qw(
18						exclude_nulls
19						exclude_given
20						linear
21						training_set
22						), {
23						exclude_nulls => 1,
24						exclude_given => 1,
25						linear => 0,
26	10		10		2707	};
	10				14209
27
28						sub BUILD {
29	195		195	0	11255	my ($self, $args) = @_;
30
31						# check for invalid arguments
32	195				212	my $class = ref $self;
33	195				371	my %valid_attrs = map {$_ => 1}
	780				3301
34						Class::Tiny->get_all_attributes_for($class);
35	195				610	my @invalids = grep {!$valid_attrs{$_}} sort keys %$args;
	744				807
36	195	100			342	if(@invalids){
37	1				10	croak "Invalid attributes for $class: " . join ' ',
38						sort @invalids;
39						}
40
41	194	100			306	if(!exists $args->{training_set}){
42	1				14	croak "Missing required parameter 'training_set'";
43						}
44
45	193	100			362	if('Algorithm::AM::DataSet' ne ref $args->{training_set}){
46	1				7	croak 'Parameter training_set should ' .
47						'be an Algorithm::AM::DataSet';
48						}
49	192				286	$self->_initialize();
50						# delete $args->{training_set};
51	192				459	return;
52						}
53
54	10		10		9145	use Algorithm::AM::Result;
	10				16
	10				257
55	10		10		45	use Algorithm::AM::BigInt 'bigcmp';
	10				10
	10				358
56	10		10		3253	use Algorithm::AM::DataSet;
	10				14
	10				471
57	10		10		3795	use Import::Into;
	10				19219
	10				789
58						# Use Import::Into to export classes into caller
59						sub import {
60	15		15		42236	my $target = caller;
61	15				93	Algorithm::AM::BigInt->import::into($target, 'bigcmp');
62	15				2513	Algorithm::AM::DataSet->import::into($target, 'dataset_from_file');
63	15				2101	Algorithm::AM::DataSet::Item->import::into($target, 'new_item');
64	15				2280	return;
65						}
66
67						require XSLoader;
68						XSLoader::load(__PACKAGE__, $VERSION);
69
70	10		10		2725	use Log::Any qw($log);
	10				57954
	10				32
71
72						# do all of the classification data structure initialization here,
73						# as well as calling the XS initialization method.
74						sub _initialize {
75	192		192		165	my ($self) = @_;
76
77	192				229	my $train = $self->training_set;
78
79						# compute sub-lattices sizes here so that lattice space can be
80						# allocated in the _xs_initialize method. If certain features are
81						# thrown out later, each sub-lattice can only get smaller, so
82						# this is safe to do once here.
83	192				418	my $lattice_sizes = _compute_lattice_sizes($train->cardinality);
84
85						# sum is intitialized to a list of zeros
86	192				342	@{$self->{sum}} = (0.0) x ($train->num_classes + 1);
	192				357
87
88						# preemptively allocate memory
89						# TODO: not sure what this does
90	192				325	@{$self->{itemcontextchain}} = (0) x $train->size;
	192				1285
91
92	192				1056	$self->{$_} = {} for (
93						qw(
94						itemcontextchainhead
95						context_to_class
96						contextsize
97						pointers
98						gang
99						)
100						);
101
102						# Initialize XS data structures
103						# TODO: Perl crashes unless this is saved. The XS
104						# must not be increasing the reference count
105	192				323	$self->{save_this} = $train->_data_classes;
106						$self->_xs_initialize(
107						$lattice_sizes,
108						$self->{save_this},
109						$self->{itemcontextchain},
110						$self->{itemcontextchainhead},
111						$self->{context_to_class},
112						$self->{contextsize},
113						$self->{pointers},
114						$self->{gang},
115						$self->{sum}
116	192				948	);
117	192				260	return;
118						}
119
120						sub classify {
121	195		195	1	3807	my ($self, $test_item) = @_;
122
123	195				237	my $training_set = $self->training_set;
124	195	100			332	if($training_set->cardinality != $test_item->cardinality){
125	1				2	croak 'Training set and test item do not have the same ' .
126						'cardinality (' . $training_set->cardinality . ' and ' .
127						$test_item->cardinality . ')';
128						}
129
130						# num_feats is the number of features to be used in classification;
131						# if we exclude nulls, then we need to minus the number of '='
132						# found in this test item; otherwise, it's just the number of
133						# columns in a single item vector
134	194				424	my $num_feats = $training_set->cardinality;
135
136	194	100			3508	if($self->exclude_nulls){
137	1808				1523	$num_feats -= grep {$_ eq ''} @{
138	189				919	$test_item->features };
	189				322
139						}
140
141						# recalculate the lattice sizes with new number of active features
142	194				391	my $lattice_sizes = _compute_lattice_sizes($num_feats);
143						## $activeContexts = 1 << $activeVar;
144
145	194				190	my $nullcontext = pack "b64", '0' x 64;
146
147	194				129	my $given_excluded = 0;
148	194				167	my $test_in_training = 0;
149
150						# initialize classification-related variables
151						# it is important to dereference rather than just
152						# assigning a new one with [] or {}. This is because
153						# the XS code has access to the existing reference,
154						# but will be accessing the wrong variable if we
155						# change it.
156	194				136	%{$self->{contextsize}} = ();
	194				284
157	194				119	%{$self->{itemcontextchainhead}} = ();
	194				157
158	194				138	%{$self->{context_to_class}} = ();
	194				149
159	194				121	%{$self->{pointers}} = ();
	194				164
160	194				119	%{$self->{gang}} = ();
	194				158
161	194				129	@{$self->{itemcontextchain}} = ();
	194				591
162						# big ints are used in AM.xs; these consist of an
163						# array of 8 unsigned longs
164	194				120	foreach (@{$self->{sum}}) {
	194				258
165	753				710	$_ = pack "L!8", 0, 0, 0, 0, 0, 0, 0, 0;
166						}
167
168						# calculate context labels and associated structures for
169						# the entire data set
170	194				359	for my $index ( 0 .. $training_set->size - 1 ) {
171						my $context = _context_label(
172						# Note: this must be copied to prevent infinite loop;
173						# see todo note for _context_label
174	30034				17359	[@{$lattice_sizes}],
	30034				53202
175						$training_set->get_item($index)->features,
176						$test_item->features,
177						$self->exclude_nulls
178						);
179	30034				58609	$self->{contextsize}->{$context}++;
180						# TODO: explain itemcontextchain and itemcontextchainhead
181						$self->{itemcontextchain}->[$index] =
182	30034				27870	$self->{itemcontextchainhead}->{$context};
183	30034				23036	$self->{itemcontextchainhead}->{$context} = $index;
184
185						# store the class for the subcontext; if there
186						# is already a different class for this subcontext,
187						# then store 0, signifying heterogeneity.
188	30034				46738	my $class = $training_set->_index_for_class(
189						$training_set->get_item($index)->class);
190	30034	100			39817	if ( defined $self->{context_to_class}->{$context} ) {
191	21669	100			32272	if($self->{context_to_class}->{$context} != $class){
192	9834				11489	$self->{context_to_class}->{$context} = 0;
193						}
194						}
195						else {
196	8365				9909	$self->{context_to_class}->{$context} = $class;
197						}
198						}
199						# $nullcontext is all 0's, which is a context label for
200						# a training item that exactly matches the test item. Exclude
201						# the item if required, and set a flag that the test item was
202						# found in the training set.
203	194	100			402	if ( exists $self->{context_to_class}->{$nullcontext} ) {
204	182				133	$test_in_training = 1;
205	182	100			2553	if($self->exclude_given){
206	177				908	delete $self->{context_to_class}->{$nullcontext};
207	177				160	$given_excluded = 1;
208						}
209						}
210						# initialize the results object to hold all of the configuration
211						# info.
212	194	100			2422	my $result = Algorithm::AM::Result->new(
213						given_excluded => $given_excluded,
214						cardinality => $num_feats,
215						exclude_nulls => $self->exclude_nulls,
216						count_method => $self->linear ? 'linear' : 'squared',
217						training_set => $training_set,
218						test_item => $test_item,
219						test_in_train => $test_in_training,
220						);
221
222	194	50			9175	$log->debug(${$result->config_info})
	0				0
223						if($log->is_debug);
224
225	194				7864	$result->start_time([ (localtime)[0..2] ]);
226	194	100			3270	$self->_fillandcount(
227						$lattice_sizes, $self->linear ? 1 : 0);
228	194				39343	$result->end_time([ (localtime)[0..2] ]);
229
230	194	100			1079	unless ($self->{pointers}->{'grandtotal'}) {
231						#TODO: is this tested yet?
232	1	50			4	if($log->is_warn){
233	0				0	$log->warn('No training items considered. ' .
234						'No prediction possible.');
235						}
236	1				8	return;
237						}
238
239						$result->_process_stats(
240						# TODO: after refactoring to a "guts" object,
241						# just pass that in
242						$self->{sum},
243						$self->{pointers},
244						$self->{itemcontextchainhead},
245						$self->{itemcontextchain},
246						$self->{context_to_class},
247						$self->{gang},
248						$lattice_sizes,
249						$self->{contextsize}
250	193				550	);
251	193				381	return $result;
252						}
253
254						# since we split the lattice in four, we have to decide which features
255						# go where. Given the number of features being used, return an arrayref
256						# containing the number of features to be used in each of the the four
257						# lattices.
258						sub _compute_lattice_sizes {
259	386		386		323	my ($num_feats) = @_;
260
261	10		10		31762	use integer;
	10				75
	10				34
262	386				256	my @lattice_sizes;
263	386				396	my $half = $num_feats / 2;
264	386				424	$lattice_sizes[0] = $half / 2;
265	386				335	$lattice_sizes[1] = $half - $lattice_sizes[0];
266	386				290	$half = $num_feats - $half;
267	386				273	$lattice_sizes[2] = $half / 2;
268	386				278	$lattice_sizes[3] = $half - $lattice_sizes[2];
269	386				433	return \@lattice_sizes;
270						}
271
272						# Create binary context labels for a training item
273						# by comparing it with a test item. Each training item
274						# needs one binary label for each sublattice (of which
275						# there are currently four), but this is packed into a
276						# single scalar representing an array of 4 shorts (this
277						# format is used in the XS side).
278
279						# TODO: we have to copy lattice_sizes out of $self in order to
280						# iterate it. Otherwise it goes on forever. Why?
281						sub _context_label {
282						# inputs:
283						# number of active features in each lattice,
284						# training item features, test item features,
285						# and boolean indicating if nulls should be excluded
286	30034		30034		102508	my ($lattice_sizes, $train_feats, $test_feats, $skip_nulls) = @_;
287
288						# feature index
289	30034				19962	my $index = 0;
290						# the binary context labels for each separate lattice
291	30034				23331	my @context_list = ();
292
293	30034				26383	for my $a (@$lattice_sizes) {
294						# binary context label for a single sublattice
295	120136				64282	my $context = 0;
296						# loop through all features in the sublattice
297						# assign 0 if features match, 1 if they do not
298	120136				129549	for ( ; $a ; --$a ) {
299
300						# skip null features if indicated
301	247274	100			243662	if($skip_nulls){
302	247187				308481	++$index while $test_feats->[$index] eq '';
303						}
304						# add a 1 for mismatched variable, 0 for matched variable
305	247274				173076	$context = ( $context << 1 ) \| (
306						$test_feats->[$index] ne $train_feats->[$index] );
307	247274				272879	++$index;
308						}
309	120136				97642	push @context_list, $context;
310						}
311						# a context label is an array of unsigned shorts in XS
312	30034				41973	my $context = pack "S!4", @context_list;
313	30034				33552	return $context;
314						}
315
316						1;
317
318						__END__