| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Algorithm::AM; | 
| 2 | 10 |  |  | 10 |  | 284674 | use strict; | 
|  | 10 |  |  |  |  | 62 |  | 
|  | 10 |  |  |  |  | 279 |  | 
| 3 | 10 |  |  | 10 |  | 47 | use warnings; | 
|  | 10 |  |  |  |  | 17 |  | 
|  | 10 |  |  |  |  | 378 |  | 
| 4 |  |  |  |  |  |  | our $VERSION = '3.12'; | 
| 5 |  |  |  |  |  |  | # ABSTRACT: Classify data with Analogical Modeling | 
| 6 | 10 |  |  | 10 |  | 53 | use feature 'state'; | 
|  | 10 |  |  |  |  | 18 |  | 
|  | 10 |  |  |  |  | 1645 |  | 
| 7 | 10 |  |  | 10 |  | 63 | use Carp; | 
|  | 10 |  |  |  |  | 15 |  | 
|  | 10 |  |  |  |  | 1523 |  | 
| 8 |  |  |  |  |  |  | our @CARP_NOT = qw(Algorithm::AM); | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | # Place this accessor here so that Class::Tiny doesn't generate | 
| 11 |  |  |  |  |  |  | # a getter/setter pair. | 
| 12 |  |  |  |  |  |  | sub training_set { | 
| 13 | 389 |  |  | 389 | 1 | 877 | my ($self) = @_; | 
| 14 | 389 |  |  |  |  | 692 | return $self->{training_set}; | 
| 15 |  |  |  |  |  |  | } | 
| 16 |  |  |  |  |  |  |  | 
| 17 | 10 |  |  |  |  | 69 | use Class::Tiny qw( | 
| 18 |  |  |  |  |  |  | exclude_nulls | 
| 19 |  |  |  |  |  |  | exclude_given | 
| 20 |  |  |  |  |  |  | linear | 
| 21 |  |  |  |  |  |  | training_set | 
| 22 |  |  |  |  |  |  | ), { | 
| 23 |  |  |  |  |  |  | exclude_nulls     => 1, | 
| 24 |  |  |  |  |  |  | exclude_given    => 1, | 
| 25 |  |  |  |  |  |  | linear      => 0, | 
| 26 | 10 |  |  | 10 |  | 3396 | }; | 
|  | 10 |  |  |  |  | 15403 |  | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | sub BUILD { | 
| 29 | 195 |  |  | 195 | 0 | 21425 | my ($self, $args) = @_; | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | # check for invalid arguments | 
| 32 | 195 |  |  |  |  | 425 | my $class = ref $self; | 
| 33 | 195 |  |  |  |  | 719 | my %valid_attrs = map {$_ => 1} | 
|  | 780 |  |  |  |  | 5887 |  | 
| 34 |  |  |  |  |  |  | Class::Tiny->get_all_attributes_for($class); | 
| 35 | 195 |  |  |  |  | 1031 | my @invalids = grep {!$valid_attrs{$_}} sort keys %$args; | 
|  | 744 |  |  |  |  | 1248 |  | 
| 36 | 195 | 100 |  |  |  | 573 | if(@invalids){ | 
| 37 | 1 |  |  |  |  | 12 | croak "Invalid attributes for $class: " . join ' ', | 
| 38 |  |  |  |  |  |  | sort @invalids; | 
| 39 |  |  |  |  |  |  | } | 
| 40 |  |  |  |  |  |  |  | 
| 41 | 194 | 100 |  |  |  | 531 | if(!exists $args->{training_set}){ | 
| 42 | 1 |  |  |  |  | 18 | croak "Missing required parameter 'training_set'"; | 
| 43 |  |  |  |  |  |  | } | 
| 44 |  |  |  |  |  |  |  | 
| 45 | 193 | 100 |  |  |  | 679 | if('Algorithm::AM::DataSet' ne ref $args->{training_set}){ | 
| 46 | 1 |  |  |  |  | 10 | croak 'Parameter training_set should ' . | 
| 47 |  |  |  |  |  |  | 'be an Algorithm::AM::DataSet'; | 
| 48 |  |  |  |  |  |  | } | 
| 49 | 192 |  |  |  |  | 594 | $self->_initialize(); | 
| 50 |  |  |  |  |  |  | # delete $args->{training_set}; | 
| 51 | 192 |  |  |  |  | 866 | return; | 
| 52 |  |  |  |  |  |  | } | 
| 53 |  |  |  |  |  |  |  | 
| 54 | 10 |  |  | 10 |  | 12317 | use Algorithm::AM::Result; | 
|  | 10 |  |  |  |  | 29 |  | 
|  | 10 |  |  |  |  | 430 |  | 
| 55 | 10 |  |  | 10 |  | 91 | use Algorithm::AM::BigInt 'bigcmp'; | 
|  | 10 |  |  |  |  | 21 |  | 
|  | 10 |  |  |  |  | 457 |  | 
| 56 | 10 |  |  | 10 |  | 4549 | use Algorithm::AM::DataSet; | 
|  | 10 |  |  |  |  | 29 |  | 
|  | 10 |  |  |  |  | 609 |  | 
| 57 | 10 |  |  | 10 |  | 5471 | use Import::Into; | 
|  | 10 |  |  |  |  | 26877 |  | 
|  | 10 |  |  |  |  | 1056 |  | 
| 58 |  |  |  |  |  |  | # Use Import::Into to export classes into caller | 
| 59 |  |  |  |  |  |  | sub import { | 
| 60 | 15 |  |  | 15 |  | 165948 | my $target = caller; | 
| 61 | 15 |  |  |  |  | 151 | Algorithm::AM::BigInt->import::into($target, 'bigcmp'); | 
| 62 | 15 |  |  |  |  | 4277 | Algorithm::AM::DataSet->import::into($target, 'dataset_from_file'); | 
| 63 | 15 |  |  |  |  | 3246 | Algorithm::AM::DataSet::Item->import::into($target, 'new_item'); | 
| 64 | 15 |  |  |  |  | 2931 | return; | 
| 65 |  |  |  |  |  |  | } | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | require XSLoader; | 
| 68 |  |  |  |  |  |  | XSLoader::load(__PACKAGE__, $VERSION); | 
| 69 |  |  |  |  |  |  |  | 
| 70 | 10 |  |  | 10 |  | 3663 | use Log::Any qw($log); | 
|  | 10 |  |  |  |  | 58328 |  | 
|  | 10 |  |  |  |  | 63 |  | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  | # do all of the classification data structure initialization here, | 
| 73 |  |  |  |  |  |  | # as well as calling the XS initialization method. | 
| 74 |  |  |  |  |  |  | sub _initialize { | 
| 75 | 192 |  |  | 192 |  | 305 | my ($self) = @_; | 
| 76 |  |  |  |  |  |  |  | 
| 77 | 192 |  |  |  |  | 516 | my $train = $self->training_set; | 
| 78 |  |  |  |  |  |  |  | 
| 79 |  |  |  |  |  |  | # compute sub-lattices sizes here so that lattice space can be | 
| 80 |  |  |  |  |  |  | # allocated in the _xs_initialize method. If certain features are | 
| 81 |  |  |  |  |  |  | # thrown out later, each sub-lattice can only get smaller, so | 
| 82 |  |  |  |  |  |  | # this is safe to do once here. | 
| 83 | 192 |  |  |  |  | 533 | my $lattice_sizes = _compute_lattice_sizes($train->cardinality); | 
| 84 |  |  |  |  |  |  |  | 
| 85 |  |  |  |  |  |  | # sum is intitialized to a list of zeros | 
| 86 | 192 |  |  |  |  | 499 | @{$self->{sum}} = (0.0) x ($train->num_classes + 1); | 
|  | 192 |  |  |  |  | 635 |  | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | # preemptively allocate memory | 
| 89 |  |  |  |  |  |  | # TODO: not sure what this does | 
| 90 | 192 |  |  |  |  | 528 | @{$self->{itemcontextchain}} = (0) x $train->size; | 
|  | 192 |  |  |  |  | 1616 |  | 
| 91 |  |  |  |  |  |  |  | 
| 92 | 192 |  |  |  |  | 1637 | $self->{$_} = {} for ( | 
| 93 |  |  |  |  |  |  | qw( | 
| 94 |  |  |  |  |  |  | itemcontextchainhead | 
| 95 |  |  |  |  |  |  | context_to_class | 
| 96 |  |  |  |  |  |  | contextsize | 
| 97 |  |  |  |  |  |  | pointers | 
| 98 |  |  |  |  |  |  | gang | 
| 99 |  |  |  |  |  |  | ) | 
| 100 |  |  |  |  |  |  | ); | 
| 101 |  |  |  |  |  |  |  | 
| 102 |  |  |  |  |  |  | # Initialize XS data structures | 
| 103 |  |  |  |  |  |  | # TODO: Perl crashes unless this is saved. The XS | 
| 104 |  |  |  |  |  |  | # must not be increasing the reference count | 
| 105 | 192 |  |  |  |  | 605 | $self->{save_this} = $train->_data_classes; | 
| 106 |  |  |  |  |  |  | $self->_xs_initialize( | 
| 107 |  |  |  |  |  |  | $lattice_sizes, | 
| 108 |  |  |  |  |  |  | $self->{save_this}, | 
| 109 |  |  |  |  |  |  | $self->{itemcontextchain}, | 
| 110 |  |  |  |  |  |  | $self->{itemcontextchainhead}, | 
| 111 |  |  |  |  |  |  | $self->{context_to_class}, | 
| 112 |  |  |  |  |  |  | $self->{contextsize}, | 
| 113 |  |  |  |  |  |  | $self->{pointers}, | 
| 114 |  |  |  |  |  |  | $self->{gang}, | 
| 115 |  |  |  |  |  |  | $self->{sum} | 
| 116 | 192 |  |  |  |  | 1923 | ); | 
| 117 | 192 |  |  |  |  | 459 | return; | 
| 118 |  |  |  |  |  |  | } | 
| 119 |  |  |  |  |  |  |  | 
| 120 |  |  |  |  |  |  | sub classify { | 
| 121 | 195 |  |  | 195 | 1 | 11254 | my ($self, $test_item) = @_; | 
| 122 |  |  |  |  |  |  |  | 
| 123 | 195 |  |  |  |  | 481 | my $training_set = $self->training_set; | 
| 124 | 195 | 100 |  |  |  | 550 | if($training_set->cardinality != $test_item->cardinality){ | 
| 125 | 1 |  |  |  |  | 4 | croak 'Training set and test item do not have the same ' . | 
| 126 |  |  |  |  |  |  | 'cardinality (' . $training_set->cardinality . ' and ' . | 
| 127 |  |  |  |  |  |  | $test_item->cardinality . ')'; | 
| 128 |  |  |  |  |  |  | } | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | # num_feats is the number of features to be used in classification; | 
| 131 |  |  |  |  |  |  | # if we exclude nulls, then we need to minus the number of '=' | 
| 132 |  |  |  |  |  |  | # found in this test item; otherwise, it's just the number of | 
| 133 |  |  |  |  |  |  | # columns in a single item vector | 
| 134 | 194 |  |  |  |  | 444 | my $num_feats = $training_set->cardinality; | 
| 135 |  |  |  |  |  |  |  | 
| 136 | 194 | 100 |  |  |  | 4997 | if($self->exclude_nulls){ | 
| 137 | 1808 |  |  |  |  | 2524 | $num_feats -= grep {$_ eq ''} @{ | 
| 138 | 189 |  |  |  |  | 1128 | $test_item->features }; | 
|  | 189 |  |  |  |  | 474 |  | 
| 139 |  |  |  |  |  |  | } | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | # recalculate the lattice sizes with new number of active features | 
| 142 | 194 |  |  |  |  | 580 | my $lattice_sizes = _compute_lattice_sizes($num_feats); | 
| 143 |  |  |  |  |  |  | ##  $activeContexts = 1 << $activeVar; | 
| 144 |  |  |  |  |  |  |  | 
| 145 | 194 |  |  |  |  | 337 | my $nullcontext = pack "b64", '0' x 64; | 
| 146 |  |  |  |  |  |  |  | 
| 147 | 194 |  |  |  |  | 276 | my $given_excluded = 0; | 
| 148 | 194 |  |  |  |  | 380 | my $test_in_training   = 0; | 
| 149 |  |  |  |  |  |  |  | 
| 150 |  |  |  |  |  |  | # initialize classification-related variables | 
| 151 |  |  |  |  |  |  | # it is important to dereference rather than just | 
| 152 |  |  |  |  |  |  | # assigning a new one with [] or {}. This is because | 
| 153 |  |  |  |  |  |  | # the XS code has access to the existing reference, | 
| 154 |  |  |  |  |  |  | # but will be accessing the wrong variable if we | 
| 155 |  |  |  |  |  |  | # change it. | 
| 156 | 194 |  |  |  |  | 303 | %{$self->{contextsize}}             = (); | 
|  | 194 |  |  |  |  | 438 |  | 
| 157 | 194 |  |  |  |  | 278 | %{$self->{itemcontextchainhead}}    = (); | 
|  | 194 |  |  |  |  | 350 |  | 
| 158 | 194 |  |  |  |  | 327 | %{$self->{context_to_class}}      = (); | 
|  | 194 |  |  |  |  | 352 |  | 
| 159 | 194 |  |  |  |  | 318 | %{$self->{pointers}}                = (); | 
|  | 194 |  |  |  |  | 347 |  | 
| 160 | 194 |  |  |  |  | 325 | %{$self->{gang}}                    = (); | 
|  | 194 |  |  |  |  | 317 |  | 
| 161 | 194 |  |  |  |  | 311 | @{$self->{itemcontextchain}}        = (); | 
|  | 194 |  |  |  |  | 764 |  | 
| 162 |  |  |  |  |  |  | # big ints are used in AM.xs; these consist of an | 
| 163 |  |  |  |  |  |  | # array of 8 unsigned longs | 
| 164 | 194 |  |  |  |  | 270 | foreach (@{$self->{sum}}) { | 
|  | 194 |  |  |  |  | 431 |  | 
| 165 | 753 |  |  |  |  | 1178 | $_ = pack "L!8", 0, 0, 0, 0, 0, 0, 0, 0; | 
| 166 |  |  |  |  |  |  | } | 
| 167 |  |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  | # calculate context labels and associated structures for | 
| 169 |  |  |  |  |  |  | # the entire data set | 
| 170 | 194 |  |  |  |  | 591 | for my $index ( 0 .. $training_set->size - 1 ) { | 
| 171 |  |  |  |  |  |  | my $context = _context_label( | 
| 172 |  |  |  |  |  |  | # Note: this must be copied to prevent infinite loop; | 
| 173 |  |  |  |  |  |  | # see todo note for _context_label | 
| 174 | 30034 |  |  |  |  | 30423 | [@{$lattice_sizes}], | 
|  | 30034 |  |  |  |  | 56815 |  | 
| 175 |  |  |  |  |  |  | $training_set->get_item($index)->features, | 
| 176 |  |  |  |  |  |  | $test_item->features, | 
| 177 |  |  |  |  |  |  | $self->exclude_nulls | 
| 178 |  |  |  |  |  |  | ); | 
| 179 | 30034 |  |  |  |  | 79501 | $self->{contextsize}->{$context}++; | 
| 180 |  |  |  |  |  |  | # TODO: explain itemcontextchain and itemcontextchainhead | 
| 181 |  |  |  |  |  |  | $self->{itemcontextchain}->[$index] = | 
| 182 | 30034 |  |  |  |  | 41125 | $self->{itemcontextchainhead}->{$context}; | 
| 183 | 30034 |  |  |  |  | 37849 | $self->{itemcontextchainhead}->{$context} = $index; | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | # store the class for the subcontext; if there | 
| 186 |  |  |  |  |  |  | # is already a different class for this subcontext, | 
| 187 |  |  |  |  |  |  | # then store 0, signifying heterogeneity. | 
| 188 | 30034 |  |  |  |  | 55783 | my $class = $training_set->_index_for_class( | 
| 189 |  |  |  |  |  |  | $training_set->get_item($index)->class); | 
| 190 | 30034 | 100 |  |  |  | 52173 | if ( defined $self->{context_to_class}->{$context} ) { | 
| 191 | 21669 | 100 |  |  |  | 38003 | if($self->{context_to_class}->{$context} != $class){ | 
| 192 | 9834 |  |  |  |  | 15328 | $self->{context_to_class}->{$context} = 0; | 
| 193 |  |  |  |  |  |  | } | 
| 194 |  |  |  |  |  |  | } | 
| 195 |  |  |  |  |  |  | else { | 
| 196 | 8365 |  |  |  |  | 15445 | $self->{context_to_class}->{$context} = $class; | 
| 197 |  |  |  |  |  |  | } | 
| 198 |  |  |  |  |  |  | } | 
| 199 |  |  |  |  |  |  | # $nullcontext is all 0's, which is a context label for | 
| 200 |  |  |  |  |  |  | # a training item that exactly matches the test item. Exclude | 
| 201 |  |  |  |  |  |  | # the item if required, and set a flag that the test item was | 
| 202 |  |  |  |  |  |  | # found in the training set. | 
| 203 | 194 | 100 |  |  |  | 694 | if ( exists $self->{context_to_class}->{$nullcontext} ) { | 
| 204 | 182 |  |  |  |  | 300 | $test_in_training = 1; | 
| 205 | 182 | 100 |  |  |  | 2858 | if($self->exclude_given){ | 
| 206 | 177 |  |  |  |  | 1425 | delete $self->{context_to_class}->{$nullcontext}; | 
| 207 | 177 |  |  |  |  | 343 | $given_excluded = 1; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  | } | 
| 210 |  |  |  |  |  |  | # initialize the results object to hold all of the configuration | 
| 211 |  |  |  |  |  |  | # info. | 
| 212 | 194 | 100 |  |  |  | 2637 | my $result = Algorithm::AM::Result->new( | 
| 213 |  |  |  |  |  |  | given_excluded => $given_excluded, | 
| 214 |  |  |  |  |  |  | cardinality => $num_feats, | 
| 215 |  |  |  |  |  |  | exclude_nulls => $self->exclude_nulls, | 
| 216 |  |  |  |  |  |  | count_method => $self->linear ? 'linear' : 'squared', | 
| 217 |  |  |  |  |  |  | training_set => $training_set, | 
| 218 |  |  |  |  |  |  | test_item => $test_item, | 
| 219 |  |  |  |  |  |  | test_in_train => $test_in_training, | 
| 220 |  |  |  |  |  |  | ); | 
| 221 |  |  |  |  |  |  |  | 
| 222 | 194 | 50 |  |  |  | 16384 | $log->debug(${$result->config_info}) | 
|  | 0 |  |  |  |  | 0 |  | 
| 223 |  |  |  |  |  |  | if($log->is_debug); | 
| 224 |  |  |  |  |  |  |  | 
| 225 | 194 |  |  |  |  | 17376 | $result->start_time([ (localtime)[0..2] ]); | 
| 226 | 194 | 100 |  |  |  | 3854 | $self->_fillandcount( | 
| 227 |  |  |  |  |  |  | $lattice_sizes, $self->linear ? 1 : 0); | 
| 228 | 194 |  |  |  |  | 46897 | $result->end_time([ (localtime)[0..2] ]); | 
| 229 |  |  |  |  |  |  |  | 
| 230 | 194 | 100 |  |  |  | 1531 | unless ($self->{pointers}->{'grandtotal'}) { | 
| 231 |  |  |  |  |  |  | #TODO: is this tested yet? | 
| 232 | 1 | 50 |  |  |  | 10 | if($log->is_warn){ | 
| 233 | 0 |  |  |  |  | 0 | $log->warn('No training items considered. ' . | 
| 234 |  |  |  |  |  |  | 'No prediction possible.'); | 
| 235 |  |  |  |  |  |  | } | 
| 236 | 1 |  |  |  |  | 11 | return; | 
| 237 |  |  |  |  |  |  | } | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | $result->_process_stats( | 
| 240 |  |  |  |  |  |  | # TODO: after refactoring to a "guts" object, | 
| 241 |  |  |  |  |  |  | # just pass that in | 
| 242 |  |  |  |  |  |  | $self->{sum}, | 
| 243 |  |  |  |  |  |  | $self->{pointers}, | 
| 244 |  |  |  |  |  |  | $self->{itemcontextchainhead}, | 
| 245 |  |  |  |  |  |  | $self->{itemcontextchain}, | 
| 246 |  |  |  |  |  |  | $self->{context_to_class}, | 
| 247 |  |  |  |  |  |  | $self->{gang}, | 
| 248 |  |  |  |  |  |  | $lattice_sizes, | 
| 249 |  |  |  |  |  |  | $self->{contextsize} | 
| 250 | 193 |  |  |  |  | 1323 | ); | 
| 251 | 193 |  |  |  |  | 630 | return $result; | 
| 252 |  |  |  |  |  |  | } | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  | # since we split the lattice in four, we have to decide which features | 
| 255 |  |  |  |  |  |  | # go where. Given the number of features being used, return an arrayref | 
| 256 |  |  |  |  |  |  | # containing the number of features to be used in each of the the four | 
| 257 |  |  |  |  |  |  | # lattices. | 
| 258 |  |  |  |  |  |  | sub _compute_lattice_sizes { | 
| 259 | 386 |  |  | 386 |  | 626 | my ($num_feats) = @_; | 
| 260 |  |  |  |  |  |  |  | 
| 261 | 10 |  |  | 10 |  | 31140 | use integer; | 
|  | 10 |  |  |  |  | 142 |  | 
|  | 10 |  |  |  |  | 62 |  | 
| 262 | 386 |  |  |  |  | 496 | my @lattice_sizes; | 
| 263 | 386 |  |  |  |  | 627 | my $half = $num_feats / 2; | 
| 264 | 386 |  |  |  |  | 774 | $lattice_sizes[0] = $half / 2; | 
| 265 | 386 |  |  |  |  | 585 | $lattice_sizes[1] = $half - $lattice_sizes[0]; | 
| 266 | 386 |  |  |  |  | 528 | $half         = $num_feats - $half; | 
| 267 | 386 |  |  |  |  | 569 | $lattice_sizes[2] = $half / 2; | 
| 268 | 386 |  |  |  |  | 622 | $lattice_sizes[3] = $half - $lattice_sizes[2]; | 
| 269 | 386 |  |  |  |  | 690 | return \@lattice_sizes; | 
| 270 |  |  |  |  |  |  | } | 
| 271 |  |  |  |  |  |  |  | 
| 272 |  |  |  |  |  |  | # Create binary context labels for a training item | 
| 273 |  |  |  |  |  |  | # by comparing it with a test item. Each training item | 
| 274 |  |  |  |  |  |  | # needs one binary label for each sublattice (of which | 
| 275 |  |  |  |  |  |  | # there are currently four), but this is packed into a | 
| 276 |  |  |  |  |  |  | # single scalar representing an array of 4 shorts (this | 
| 277 |  |  |  |  |  |  | # format is used in the XS side). | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | # TODO: we have to copy lattice_sizes out of $self in order to | 
| 280 |  |  |  |  |  |  | # iterate it. Otherwise it goes on forever. Why? | 
| 281 |  |  |  |  |  |  | sub _context_label { | 
| 282 |  |  |  |  |  |  | # inputs: | 
| 283 |  |  |  |  |  |  | # number of active features in each lattice, | 
| 284 |  |  |  |  |  |  | # training item features, test item features, | 
| 285 |  |  |  |  |  |  | # and boolean indicating if nulls should be excluded | 
| 286 | 30034 |  |  | 30034 |  | 119571 | my ($lattice_sizes, $train_feats, $test_feats, $skip_nulls) = @_; | 
| 287 |  |  |  |  |  |  |  | 
| 288 |  |  |  |  |  |  | # feature index | 
| 289 | 30034 |  |  |  |  | 31683 | my $index        = 0; | 
| 290 |  |  |  |  |  |  | # the binary context labels for each separate lattice | 
| 291 | 30034 |  |  |  |  | 32506 | my @context_list    = (); | 
| 292 |  |  |  |  |  |  |  | 
| 293 | 30034 |  |  |  |  | 37172 | for my $a (@$lattice_sizes) { | 
| 294 |  |  |  |  |  |  | # binary context label for a single sublattice | 
| 295 | 120136 |  |  |  |  | 112519 | my $context = 0; | 
| 296 |  |  |  |  |  |  | # loop through all features in the sublattice | 
| 297 |  |  |  |  |  |  | # assign 0 if features match, 1 if they do not | 
| 298 | 120136 |  |  |  |  | 151928 | for ( ; $a ; --$a ) { | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | # skip null features if indicated | 
| 301 | 247274 | 100 |  |  |  | 292639 | if($skip_nulls){ | 
| 302 | 247187 |  |  |  |  | 338202 | ++$index while $test_feats->[$index] eq ''; | 
| 303 |  |  |  |  |  |  | } | 
| 304 |  |  |  |  |  |  | # add a 1 for mismatched variable, 0 for matched variable | 
| 305 | 247274 |  |  |  |  | 273573 | $context = ( $context << 1 ) | ( | 
| 306 |  |  |  |  |  |  | $test_feats->[$index] ne $train_feats->[$index] ); | 
| 307 | 247274 |  |  |  |  | 310486 | ++$index; | 
| 308 |  |  |  |  |  |  | } | 
| 309 | 120136 |  |  |  |  | 145713 | push @context_list, $context; | 
| 310 |  |  |  |  |  |  | } | 
| 311 |  |  |  |  |  |  | # a context label is an array of unsigned shorts in XS | 
| 312 | 30034 |  |  |  |  | 52929 | my $context = pack "S!4", @context_list; | 
| 313 | 30034 |  |  |  |  | 51064 | return $context; | 
| 314 |  |  |  |  |  |  | } | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | 1; | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | __END__ |