File Coverage

blib/lib/Algorithm/AM.pm
Criterion Covered Total %
statement 141 143 98.6
branch 28 30 93.3
condition n/a
subroutine 18 18 100.0
pod 2 3 66.6
total 189 194 97.4


line stmt bran cond sub pod time code
1             package Algorithm::AM;
2 10     10   212060 use strict;
  10         54  
  10         240  
3 10     10   46 use warnings;
  10         16  
  10         320  
4             our $VERSION = '3.11';
5             # ABSTRACT: Classify data with Analogical Modeling
6 10     10   45 use feature 'state';
  10         15  
  10         881  
7 10     10   64 use Carp;
  10         15  
  10         1232  
8             our @CARP_NOT = qw(Algorithm::AM);
9              
10             # Place this accessor here so that Class::Tiny doesn't generate
11             # a getter/setter pair.
12             sub training_set {
13 389     389 1 622 my ($self) = @_;
14 389         674 return $self->{training_set};
15             }
16              
17 10         68 use Class::Tiny qw(
18             exclude_nulls
19             exclude_given
20             linear
21             training_set
22             ), {
23             exclude_nulls => 1,
24             exclude_given => 1,
25             linear => 0,
26 10     10   2159 };
  10         11489  
27              
28             sub BUILD {
29 195     195 0 18456 my ($self, $args) = @_;
30              
31             # check for invalid arguments
32 195         349 my $class = ref $self;
33 195         624 my %valid_attrs = map {$_ => 1}
  780         5689  
34             Class::Tiny->get_all_attributes_for($class);
35 195         1000 my @invalids = grep {!$valid_attrs{$_}} sort keys %$args;
  744         1246  
36 195 100       544 if(@invalids){
37 1         12 croak "Invalid attributes for $class: " . join ' ',
38             sort @invalids;
39             }
40              
41 194 100       509 if(!exists $args->{training_set}){
42 1         15 croak "Missing required parameter 'training_set'";
43             }
44              
45 193 100       542 if('Algorithm::AM::DataSet' ne ref $args->{training_set}){
46 1         8 croak 'Parameter training_set should ' .
47             'be an Algorithm::AM::DataSet';
48             }
49 192         602 $self->_initialize();
50             # delete $args->{training_set};
51 192         643 return;
52             }
53              
54 10     10   9220 use Algorithm::AM::Result;
  10         28  
  10         419  
55 10     10   81 use Algorithm::AM::BigInt 'bigcmp';
  10         16  
  10         463  
56 10     10   3096 use Algorithm::AM::DataSet;
  10         29  
  10         544  
57 10     10   3189 use Import::Into;
  10         19736  
  10         998  
58             # Use Import::Into to export classes into caller
59             sub import {
60 15     15   125351 my $target = caller;
61 15         135 Algorithm::AM::BigInt->import::into($target, 'bigcmp');
62 15         3558 Algorithm::AM::DataSet->import::into($target, 'dataset_from_file');
63 15         2621 Algorithm::AM::DataSet::Item->import::into($target, 'new_item');
64 15         2820 return;
65             }
66              
67             require XSLoader;
68             XSLoader::load(__PACKAGE__, $VERSION);
69              
70 10     10   1844 use Log::Any qw($log);
  10         40230  
  10         61  
71              
72             # do all of the classification data structure initialization here,
73             # as well as calling the XS initialization method.
74             sub _initialize {
75 192     192   372 my ($self) = @_;
76              
77 192         458 my $train = $self->training_set;
78              
79             # compute sub-lattices sizes here so that lattice space can be
80             # allocated in the _xs_initialize method. If certain features are
81             # thrown out later, each sub-lattice can only get smaller, so
82             # this is safe to do once here.
83 192         584 my $lattice_sizes = _compute_lattice_sizes($train->cardinality);
84              
85             # sum is intitialized to a list of zeros
86 192         450 @{$self->{sum}} = (0.0) x ($train->num_classes + 1);
  192         601  
87              
88             # preemptively allocate memory
89             # TODO: not sure what this does
90 192         529 @{$self->{itemcontextchain}} = (0) x $train->size;
  192         1361  
91              
92 192         1437 $self->{$_} = {} for (
93             qw(
94             itemcontextchainhead
95             context_to_class
96             contextsize
97             pointers
98             gang
99             )
100             );
101              
102             # Initialize XS data structures
103             # TODO: Perl crashes unless this is saved. The XS
104             # must not be increasing the reference count
105 192         563 $self->{save_this} = $train->_data_classes;
106             $self->_xs_initialize(
107             $lattice_sizes,
108             $self->{save_this},
109             $self->{itemcontextchain},
110             $self->{itemcontextchainhead},
111             $self->{context_to_class},
112             $self->{contextsize},
113             $self->{pointers},
114             $self->{gang},
115             $self->{sum}
116 192         1451 );
117 192         399 return;
118             }
119              
120             sub classify {
121 195     195 1 9915 my ($self, $test_item) = @_;
122              
123 195         443 my $training_set = $self->training_set;
124 195 100       461 if($training_set->cardinality != $test_item->cardinality){
125 1         4 croak 'Training set and test item do not have the same ' .
126             'cardinality (' . $training_set->cardinality . ' and ' .
127             $test_item->cardinality . ')';
128             }
129              
130             # num_feats is the number of features to be used in classification;
131             # if we exclude nulls, then we need to minus the number of '='
132             # found in this test item; otherwise, it's just the number of
133             # columns in a single item vector
134 194         415 my $num_feats = $training_set->cardinality;
135              
136 194 100       4141 if($self->exclude_nulls){
137 1808         2712 $num_feats -= grep {$_ eq ''} @{
138 189         1061 $test_item->features };
  189         368  
139             }
140              
141             # recalculate the lattice sizes with new number of active features
142 194         539 my $lattice_sizes = _compute_lattice_sizes($num_feats);
143             ## $activeContexts = 1 << $activeVar;
144              
145 194         332 my $nullcontext = pack "b64", '0' x 64;
146              
147 194         269 my $given_excluded = 0;
148 194         245 my $test_in_training = 0;
149              
150             # initialize classification-related variables
151             # it is important to dereference rather than just
152             # assigning a new one with [] or {}. This is because
153             # the XS code has access to the existing reference,
154             # but will be accessing the wrong variable if we
155             # change it.
156 194         243 %{$self->{contextsize}} = ();
  194         375  
157 194         269 %{$self->{itemcontextchainhead}} = ();
  194         294  
158 194         264 %{$self->{context_to_class}} = ();
  194         299  
159 194         230 %{$self->{pointers}} = ();
  194         253  
160 194         239 %{$self->{gang}} = ();
  194         270  
161 194         316 @{$self->{itemcontextchain}} = ();
  194         771  
162             # big ints are used in AM.xs; these consist of an
163             # array of 8 unsigned longs
164 194         258 foreach (@{$self->{sum}}) {
  194         402  
165 753         1078 $_ = pack "L!8", 0, 0, 0, 0, 0, 0, 0, 0;
166             }
167              
168             # calculate context labels and associated structures for
169             # the entire data set
170 194         526 for my $index ( 0 .. $training_set->size - 1 ) {
171             my $context = _context_label(
172             # Note: this must be copied to prevent infinite loop;
173             # see todo note for _context_label
174 30034         32003 [@{$lattice_sizes}],
  30034         62614  
175             $training_set->get_item($index)->features,
176             $test_item->features,
177             $self->exclude_nulls
178             );
179 30034         80078 $self->{contextsize}->{$context}++;
180             # TODO: explain itemcontextchain and itemcontextchainhead
181             $self->{itemcontextchain}->[$index] =
182 30034         43417 $self->{itemcontextchainhead}->{$context};
183 30034         38692 $self->{itemcontextchainhead}->{$context} = $index;
184              
185             # store the class for the subcontext; if there
186             # is already a different class for this subcontext,
187             # then store 0, signifying heterogeneity.
188 30034         57031 my $class = $training_set->_index_for_class(
189             $training_set->get_item($index)->class);
190 30034 100       53857 if ( defined $self->{context_to_class}->{$context} ) {
191 21669 100       40242 if($self->{context_to_class}->{$context} != $class){
192 9834         15247 $self->{context_to_class}->{$context} = 0;
193             }
194             }
195             else {
196 8365         14278 $self->{context_to_class}->{$context} = $class;
197             }
198             }
199             # $nullcontext is all 0's, which is a context label for
200             # a training item that exactly matches the test item. Exclude
201             # the item if required, and set a flag that the test item was
202             # found in the training set.
203 194 100       595 if ( exists $self->{context_to_class}->{$nullcontext} ) {
204 182         252 $test_in_training = 1;
205 182 100       2798 if($self->exclude_given){
206 177         1218 delete $self->{context_to_class}->{$nullcontext};
207 177         332 $given_excluded = 1;
208             }
209             }
210             # initialize the results object to hold all of the configuration
211             # info.
212 194 100       2712 my $result = Algorithm::AM::Result->new(
213             given_excluded => $given_excluded,
214             cardinality => $num_feats,
215             exclude_nulls => $self->exclude_nulls,
216             count_method => $self->linear ? 'linear' : 'squared',
217             training_set => $training_set,
218             test_item => $test_item,
219             test_in_train => $test_in_training,
220             );
221              
222 194 50       14562 $log->debug(${$result->config_info})
  0         0  
223             if($log->is_debug);
224              
225 194         12732 $result->start_time([ (localtime)[0..2] ]);
226 194 100       3590 $self->_fillandcount(
227             $lattice_sizes, $self->linear ? 1 : 0);
228 194         42769 $result->end_time([ (localtime)[0..2] ]);
229              
230 194 100       1379 unless ($self->{pointers}->{'grandtotal'}) {
231             #TODO: is this tested yet?
232 1 50       7 if($log->is_warn){
233 0         0 $log->warn('No training items considered. ' .
234             'No prediction possible.');
235             }
236 1         11 return;
237             }
238              
239             $result->_process_stats(
240             # TODO: after refactoring to a "guts" object,
241             # just pass that in
242             $self->{sum},
243             $self->{pointers},
244             $self->{itemcontextchainhead},
245             $self->{itemcontextchain},
246             $self->{context_to_class},
247             $self->{gang},
248             $lattice_sizes,
249             $self->{contextsize}
250 193         1115 );
251 193         560 return $result;
252             }
253              
254             # since we split the lattice in four, we have to decide which features
255             # go where. Given the number of features being used, return an arrayref
256             # containing the number of features to be used in each of the the four
257             # lattices.
258             sub _compute_lattice_sizes {
259 386     386   619 my ($num_feats) = @_;
260              
261 10     10   21736 use integer;
  10         124  
  10         45  
262 386         479 my @lattice_sizes;
263 386         594 my $half = $num_feats / 2;
264 386         658 $lattice_sizes[0] = $half / 2;
265 386         564 $lattice_sizes[1] = $half - $lattice_sizes[0];
266 386         558 $half = $num_feats - $half;
267 386         601 $lattice_sizes[2] = $half / 2;
268 386         516 $lattice_sizes[3] = $half - $lattice_sizes[2];
269 386         666 return \@lattice_sizes;
270             }
271              
272             # Create binary context labels for a training item
273             # by comparing it with a test item. Each training item
274             # needs one binary label for each sublattice (of which
275             # there are currently four), but this is packed into a
276             # single scalar representing an array of 4 shorts (this
277             # format is used in the XS side).
278              
279             # TODO: we have to copy lattice_sizes out of $self in order to
280             # iterate it. Otherwise it goes on forever. Why?
281             sub _context_label {
282             # inputs:
283             # number of active features in each lattice,
284             # training item features, test item features,
285             # and boolean indicating if nulls should be excluded
286 30034     30034   134060 my ($lattice_sizes, $train_feats, $test_feats, $skip_nulls) = @_;
287              
288             # feature index
289 30034         33147 my $index = 0;
290             # the binary context labels for each separate lattice
291 30034         34830 my @context_list = ();
292              
293 30034         42092 for my $a (@$lattice_sizes) {
294             # binary context label for a single sublattice
295 120136         122636 my $context = 0;
296             # loop through all features in the sublattice
297             # assign 0 if features match, 1 if they do not
298 120136         163287 for ( ; $a ; --$a ) {
299              
300             # skip null features if indicated
301 247274 100       314286 if($skip_nulls){
302 247187         365419 ++$index while $test_feats->[$index] eq '';
303             }
304             # add a 1 for mismatched variable, 0 for matched variable
305 247274         295349 $context = ( $context << 1 ) | (
306             $test_feats->[$index] ne $train_feats->[$index] );
307 247274         348548 ++$index;
308             }
309 120136         151889 push @context_list, $context;
310             }
311             # a context label is an array of unsigned shorts in XS
312 30034         56730 my $context = pack "S!4", @context_list;
313 30034         52758 return $context;
314             }
315              
316             1;
317              
318             __END__