File Coverage

blib/lib/Algorithm/AM.pm
Criterion Covered Total %
statement 141 143 98.6
branch 28 30 93.3
condition n/a
subroutine 18 18 100.0
pod 2 3 66.6
total 189 194 97.4


line stmt bran cond sub pod time code
1             package Algorithm::AM;
2 10     10   75665 use strict;
  10         12  
  10         207  
3 10     10   24 use warnings;
  10         10  
  10         292  
4             our $VERSION = '3.10';
5             # ABSTRACT: Classify data with Analogical Modeling
6 10     10   28 use feature 'state';
  10         10  
  10         606  
7 10     10   33 use Carp;
  10         20  
  10         967  
8             our @CARP_NOT = qw(Algorithm::AM);
9              
10             # Place this accessor here so that Class::Tiny doesn't generate
11             # a getter/setter pair.
12             sub training_set {
13 389     389 1 297 my ($self) = @_;
14 389         468 return $self->{training_set};
15             }
16              
17 10         52 use Class::Tiny qw(
18             exclude_nulls
19             exclude_given
20             linear
21             training_set
22             ), {
23             exclude_nulls => 1,
24             exclude_given => 1,
25             linear => 0,
26 10     10   2707 };
  10         14209  
27              
28             sub BUILD {
29 195     195 0 11255 my ($self, $args) = @_;
30              
31             # check for invalid arguments
32 195         212 my $class = ref $self;
33 195         371 my %valid_attrs = map {$_ => 1}
  780         3301  
34             Class::Tiny->get_all_attributes_for($class);
35 195         610 my @invalids = grep {!$valid_attrs{$_}} sort keys %$args;
  744         807  
36 195 100       342 if(@invalids){
37 1         10 croak "Invalid attributes for $class: " . join ' ',
38             sort @invalids;
39             }
40              
41 194 100       306 if(!exists $args->{training_set}){
42 1         14 croak "Missing required parameter 'training_set'";
43             }
44              
45 193 100       362 if('Algorithm::AM::DataSet' ne ref $args->{training_set}){
46 1         7 croak 'Parameter training_set should ' .
47             'be an Algorithm::AM::DataSet';
48             }
49 192         286 $self->_initialize();
50             # delete $args->{training_set};
51 192         459 return;
52             }
53              
54 10     10   9145 use Algorithm::AM::Result;
  10         16  
  10         257  
55 10     10   45 use Algorithm::AM::BigInt 'bigcmp';
  10         10  
  10         358  
56 10     10   3253 use Algorithm::AM::DataSet;
  10         14  
  10         471  
57 10     10   3795 use Import::Into;
  10         19219  
  10         789  
58             # Use Import::Into to export classes into caller
59             sub import {
60 15     15   42236 my $target = caller;
61 15         93 Algorithm::AM::BigInt->import::into($target, 'bigcmp');
62 15         2513 Algorithm::AM::DataSet->import::into($target, 'dataset_from_file');
63 15         2101 Algorithm::AM::DataSet::Item->import::into($target, 'new_item');
64 15         2280 return;
65             }
66              
67             require XSLoader;
68             XSLoader::load(__PACKAGE__, $VERSION);
69              
70 10     10   2725 use Log::Any qw($log);
  10         57954  
  10         32  
71              
72             # do all of the classification data structure initialization here,
73             # as well as calling the XS initialization method.
74             sub _initialize {
75 192     192   165 my ($self) = @_;
76              
77 192         229 my $train = $self->training_set;
78              
79             # compute sub-lattices sizes here so that lattice space can be
80             # allocated in the _xs_initialize method. If certain features are
81             # thrown out later, each sub-lattice can only get smaller, so
82             # this is safe to do once here.
83 192         418 my $lattice_sizes = _compute_lattice_sizes($train->cardinality);
84              
85             # sum is intitialized to a list of zeros
86 192         342 @{$self->{sum}} = (0.0) x ($train->num_classes + 1);
  192         357  
87              
88             # preemptively allocate memory
89             # TODO: not sure what this does
90 192         325 @{$self->{itemcontextchain}} = (0) x $train->size;
  192         1285  
91              
92 192         1056 $self->{$_} = {} for (
93             qw(
94             itemcontextchainhead
95             context_to_class
96             contextsize
97             pointers
98             gang
99             )
100             );
101              
102             # Initialize XS data structures
103             # TODO: Perl crashes unless this is saved. The XS
104             # must not be increasing the reference count
105 192         323 $self->{save_this} = $train->_data_classes;
106             $self->_xs_initialize(
107             $lattice_sizes,
108             $self->{save_this},
109             $self->{itemcontextchain},
110             $self->{itemcontextchainhead},
111             $self->{context_to_class},
112             $self->{contextsize},
113             $self->{pointers},
114             $self->{gang},
115             $self->{sum}
116 192         948 );
117 192         260 return;
118             }
119              
120             sub classify {
121 195     195 1 3807 my ($self, $test_item) = @_;
122              
123 195         237 my $training_set = $self->training_set;
124 195 100       332 if($training_set->cardinality != $test_item->cardinality){
125 1         2 croak 'Training set and test item do not have the same ' .
126             'cardinality (' . $training_set->cardinality . ' and ' .
127             $test_item->cardinality . ')';
128             }
129              
130             # num_feats is the number of features to be used in classification;
131             # if we exclude nulls, then we need to minus the number of '='
132             # found in this test item; otherwise, it's just the number of
133             # columns in a single item vector
134 194         424 my $num_feats = $training_set->cardinality;
135              
136 194 100       3508 if($self->exclude_nulls){
137 1808         1523 $num_feats -= grep {$_ eq ''} @{
138 189         919 $test_item->features };
  189         322  
139             }
140              
141             # recalculate the lattice sizes with new number of active features
142 194         391 my $lattice_sizes = _compute_lattice_sizes($num_feats);
143             ## $activeContexts = 1 << $activeVar;
144              
145 194         190 my $nullcontext = pack "b64", '0' x 64;
146              
147 194         129 my $given_excluded = 0;
148 194         167 my $test_in_training = 0;
149              
150             # initialize classification-related variables
151             # it is important to dereference rather than just
152             # assigning a new one with [] or {}. This is because
153             # the XS code has access to the existing reference,
154             # but will be accessing the wrong variable if we
155             # change it.
156 194         136 %{$self->{contextsize}} = ();
  194         284  
157 194         119 %{$self->{itemcontextchainhead}} = ();
  194         157  
158 194         138 %{$self->{context_to_class}} = ();
  194         149  
159 194         121 %{$self->{pointers}} = ();
  194         164  
160 194         119 %{$self->{gang}} = ();
  194         158  
161 194         129 @{$self->{itemcontextchain}} = ();
  194         591  
162             # big ints are used in AM.xs; these consist of an
163             # array of 8 unsigned longs
164 194         120 foreach (@{$self->{sum}}) {
  194         258  
165 753         710 $_ = pack "L!8", 0, 0, 0, 0, 0, 0, 0, 0;
166             }
167              
168             # calculate context labels and associated structures for
169             # the entire data set
170 194         359 for my $index ( 0 .. $training_set->size - 1 ) {
171             my $context = _context_label(
172             # Note: this must be copied to prevent infinite loop;
173             # see todo note for _context_label
174 30034         17359 [@{$lattice_sizes}],
  30034         53202  
175             $training_set->get_item($index)->features,
176             $test_item->features,
177             $self->exclude_nulls
178             );
179 30034         58609 $self->{contextsize}->{$context}++;
180             # TODO: explain itemcontextchain and itemcontextchainhead
181             $self->{itemcontextchain}->[$index] =
182 30034         27870 $self->{itemcontextchainhead}->{$context};
183 30034         23036 $self->{itemcontextchainhead}->{$context} = $index;
184              
185             # store the class for the subcontext; if there
186             # is already a different class for this subcontext,
187             # then store 0, signifying heterogeneity.
188 30034         46738 my $class = $training_set->_index_for_class(
189             $training_set->get_item($index)->class);
190 30034 100       39817 if ( defined $self->{context_to_class}->{$context} ) {
191 21669 100       32272 if($self->{context_to_class}->{$context} != $class){
192 9834         11489 $self->{context_to_class}->{$context} = 0;
193             }
194             }
195             else {
196 8365         9909 $self->{context_to_class}->{$context} = $class;
197             }
198             }
199             # $nullcontext is all 0's, which is a context label for
200             # a training item that exactly matches the test item. Exclude
201             # the item if required, and set a flag that the test item was
202             # found in the training set.
203 194 100       402 if ( exists $self->{context_to_class}->{$nullcontext} ) {
204 182         133 $test_in_training = 1;
205 182 100       2553 if($self->exclude_given){
206 177         908 delete $self->{context_to_class}->{$nullcontext};
207 177         160 $given_excluded = 1;
208             }
209             }
210             # initialize the results object to hold all of the configuration
211             # info.
212 194 100       2422 my $result = Algorithm::AM::Result->new(
213             given_excluded => $given_excluded,
214             cardinality => $num_feats,
215             exclude_nulls => $self->exclude_nulls,
216             count_method => $self->linear ? 'linear' : 'squared',
217             training_set => $training_set,
218             test_item => $test_item,
219             test_in_train => $test_in_training,
220             );
221              
222 194 50       9175 $log->debug(${$result->config_info})
  0         0  
223             if($log->is_debug);
224              
225 194         7864 $result->start_time([ (localtime)[0..2] ]);
226 194 100       3270 $self->_fillandcount(
227             $lattice_sizes, $self->linear ? 1 : 0);
228 194         39343 $result->end_time([ (localtime)[0..2] ]);
229              
230 194 100       1079 unless ($self->{pointers}->{'grandtotal'}) {
231             #TODO: is this tested yet?
232 1 50       4 if($log->is_warn){
233 0         0 $log->warn('No training items considered. ' .
234             'No prediction possible.');
235             }
236 1         8 return;
237             }
238              
239             $result->_process_stats(
240             # TODO: after refactoring to a "guts" object,
241             # just pass that in
242             $self->{sum},
243             $self->{pointers},
244             $self->{itemcontextchainhead},
245             $self->{itemcontextchain},
246             $self->{context_to_class},
247             $self->{gang},
248             $lattice_sizes,
249             $self->{contextsize}
250 193         550 );
251 193         381 return $result;
252             }
253              
254             # since we split the lattice in four, we have to decide which features
255             # go where. Given the number of features being used, return an arrayref
256             # containing the number of features to be used in each of the the four
257             # lattices.
258             sub _compute_lattice_sizes {
259 386     386   323 my ($num_feats) = @_;
260              
261 10     10   31762 use integer;
  10         75  
  10         34  
262 386         256 my @lattice_sizes;
263 386         396 my $half = $num_feats / 2;
264 386         424 $lattice_sizes[0] = $half / 2;
265 386         335 $lattice_sizes[1] = $half - $lattice_sizes[0];
266 386         290 $half = $num_feats - $half;
267 386         273 $lattice_sizes[2] = $half / 2;
268 386         278 $lattice_sizes[3] = $half - $lattice_sizes[2];
269 386         433 return \@lattice_sizes;
270             }
271              
272             # Create binary context labels for a training item
273             # by comparing it with a test item. Each training item
274             # needs one binary label for each sublattice (of which
275             # there are currently four), but this is packed into a
276             # single scalar representing an array of 4 shorts (this
277             # format is used in the XS side).
278              
279             # TODO: we have to copy lattice_sizes out of $self in order to
280             # iterate it. Otherwise it goes on forever. Why?
281             sub _context_label {
282             # inputs:
283             # number of active features in each lattice,
284             # training item features, test item features,
285             # and boolean indicating if nulls should be excluded
286 30034     30034   102508 my ($lattice_sizes, $train_feats, $test_feats, $skip_nulls) = @_;
287              
288             # feature index
289 30034         19962 my $index = 0;
290             # the binary context labels for each separate lattice
291 30034         23331 my @context_list = ();
292              
293 30034         26383 for my $a (@$lattice_sizes) {
294             # binary context label for a single sublattice
295 120136         64282 my $context = 0;
296             # loop through all features in the sublattice
297             # assign 0 if features match, 1 if they do not
298 120136         129549 for ( ; $a ; --$a ) {
299              
300             # skip null features if indicated
301 247274 100       243662 if($skip_nulls){
302 247187         308481 ++$index while $test_feats->[$index] eq '';
303             }
304             # add a 1 for mismatched variable, 0 for matched variable
305 247274         173076 $context = ( $context << 1 ) | (
306             $test_feats->[$index] ne $train_feats->[$index] );
307 247274         272879 ++$index;
308             }
309 120136         97642 push @context_list, $context;
310             }
311             # a context label is an array of unsigned shorts in XS
312 30034         41973 my $context = pack "S!4", @context_list;
313 30034         33552 return $context;
314             }
315              
316             1;
317              
318             __END__