| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Algorithm::AM::DataSet; | 
| 2 | 10 |  |  | 10 |  | 76247 | use strict; | 
|  | 10 |  |  |  |  | 30 |  | 
|  | 10 |  |  |  |  | 296 |  | 
| 3 | 10 |  |  | 10 |  | 49 | use warnings; | 
|  | 10 |  |  |  |  | 18 |  | 
|  | 10 |  |  |  |  | 457 |  | 
| 4 |  |  |  |  |  |  | our $VERSION = '3.12'; | 
| 5 |  |  |  |  |  |  | # ABSTRACT: Manage data used by Algorithm::AM | 
| 6 | 10 |  |  | 10 |  | 117 | use Carp; | 
|  | 10 |  |  |  |  | 22 |  | 
|  | 10 |  |  |  |  | 628 |  | 
| 7 | 10 |  |  | 10 |  | 4357 | use Algorithm::AM::DataSet::Item; | 
|  | 10 |  |  |  |  | 23 |  | 
|  | 10 |  |  |  |  | 441 |  | 
| 8 | 10 |  |  | 10 |  | 7017 | use Path::Tiny; | 
|  | 10 |  |  |  |  | 99881 |  | 
|  | 10 |  |  |  |  | 603 |  | 
| 9 |  |  |  |  |  |  | use Exporter::Easy ( | 
| 10 | 10 |  |  |  |  | 69 | OK => ['dataset_from_file'] | 
| 11 | 10 |  |  | 10 |  | 94 | ); | 
|  | 10 |  |  |  |  | 21 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | #pod =head1 SYNOPSIS | 
| 14 |  |  |  |  |  |  | #pod | 
| 15 |  |  |  |  |  |  | #pod  use Algorithm::AM::DataSet 'dataset_from_file'; | 
| 16 |  |  |  |  |  |  | #pod  use Algorithm::AM::DataSet::Item 'new_item'; | 
| 17 |  |  |  |  |  |  | #pod  my $dataset = Algorithm::AM::DataSet->new(cardinality => 10); | 
| 18 |  |  |  |  |  |  | #pod  # or | 
| 19 |  |  |  |  |  |  | #pod  $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas'); | 
| 20 |  |  |  |  |  |  | #pod  $dataset->add_item( | 
| 21 |  |  |  |  |  |  | #pod    new_item(features => [qw(a b c d e f g h i)])); | 
| 22 |  |  |  |  |  |  | #pod  my $item = $dataset->get_item(2); | 
| 23 |  |  |  |  |  |  | #pod | 
| 24 |  |  |  |  |  |  | #pod =head1 DESCRIPTION | 
| 25 |  |  |  |  |  |  | #pod | 
| 26 |  |  |  |  |  |  | #pod This package contains a list of items that can be used by | 
| 27 |  |  |  |  |  |  | #pod L<Algorithm::AM> or L<Algorithm::AM::Batch> for classification. | 
| 28 |  |  |  |  |  |  | #pod DataSets can be made one item at a time via the L</add_item> method, | 
| 29 |  |  |  |  |  |  | #pod or they can be read from files via the L</dataset_from_file> function. | 
| 30 |  |  |  |  |  |  | #pod | 
| 31 |  |  |  |  |  |  | #pod =head2 C<new> | 
| 32 |  |  |  |  |  |  | #pod | 
| 33 |  |  |  |  |  |  | #pod Creates a new DataSet object. You must provide a C<cardinality> argument | 
| 34 |  |  |  |  |  |  | #pod indicating the number of features to be contained in each data vector. | 
| 35 |  |  |  |  |  |  | #pod You can then add items via the add_item method. Each item will contain | 
| 36 |  |  |  |  |  |  | #pod a feature vector, and also optionally a class label and a comment | 
| 37 |  |  |  |  |  |  | #pod (also called a "spec"). | 
| 38 |  |  |  |  |  |  | #pod | 
| 39 |  |  |  |  |  |  | #pod =cut | 
| 40 |  |  |  |  |  |  | sub new { | 
| 41 | 50 |  |  | 50 | 1 | 34249 | my ($class, %opts) = @_; | 
| 42 |  |  |  |  |  |  |  | 
| 43 | 50 |  |  |  |  | 176 | my $new_opts = _check_opts(%opts); | 
| 44 |  |  |  |  |  |  |  | 
| 45 | 48 |  |  |  |  | 96 | my $self = bless $new_opts, $class; | 
| 46 |  |  |  |  |  |  |  | 
| 47 | 48 |  |  |  |  | 261 | $self->_init; | 
| 48 |  |  |  |  |  |  |  | 
| 49 | 48 |  |  |  |  | 166 | return $self; | 
| 50 |  |  |  |  |  |  | } | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | # check the options for validity | 
| 53 |  |  |  |  |  |  | # Return an option hash to initialize $self with | 
| 54 |  |  |  |  |  |  | # For now only 'cardinality' is allowed/required. | 
| 55 |  |  |  |  |  |  | sub _check_opts { | 
| 56 | 50 |  |  | 50 |  | 131 | my (%opts) = @_; | 
| 57 |  |  |  |  |  |  |  | 
| 58 | 50 |  |  |  |  | 79 | my %final_opts; | 
| 59 |  |  |  |  |  |  |  | 
| 60 | 50 | 100 |  |  |  | 141 | if(!defined $opts{cardinality}){ | 
| 61 | 1 |  |  |  |  | 16 | croak q{Failed to provide 'cardinality' parameter}; | 
| 62 |  |  |  |  |  |  | } | 
| 63 | 49 |  |  |  |  | 111 | $final_opts{cardinality} = $opts{cardinality}; | 
| 64 | 49 |  |  |  |  | 112 | delete $opts{cardinality}; | 
| 65 |  |  |  |  |  |  |  | 
| 66 | 49 | 100 |  |  |  | 125 | if(keys %opts){ | 
| 67 |  |  |  |  |  |  | # sort the keys in the error message to make testing possible | 
| 68 | 1 |  |  |  |  | 15 | croak 'Unknown parameters in DataSet constructor: ' . | 
| 69 |  |  |  |  |  |  | (join ', ', sort keys %opts); | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  |  | 
| 72 | 48 |  |  |  |  | 111 | return \%final_opts; | 
| 73 |  |  |  |  |  |  | } | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | # initialize internal state | 
| 76 |  |  |  |  |  |  | sub _init { | 
| 77 | 48 |  |  | 48 |  | 79 | my ($self) = @_; | 
| 78 |  |  |  |  |  |  | # contains all of the items in the dataset | 
| 79 | 48 |  |  |  |  | 171 | $self->{items} = []; | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | # map unique class labels to unique integers; | 
| 82 |  |  |  |  |  |  | # these are the indices of the class labels in class_list below; | 
| 83 |  |  |  |  |  |  | # the indices must start at 1 for AM to work, as 0 is reserved | 
| 84 |  |  |  |  |  |  | # for heterogeneity. | 
| 85 | 48 |  |  |  |  | 115 | $self->{class_num_index} = {}; | 
| 86 |  |  |  |  |  |  | # contains the list of class strings in an order that matches | 
| 87 |  |  |  |  |  |  | # the indices in class_num_index | 
| 88 | 48 |  |  |  |  | 83 | $self->{class_list} = []; | 
| 89 |  |  |  |  |  |  | # the total number of different classes contained in the data set | 
| 90 | 48 |  |  |  |  | 76 | $self->{num_classes} = 0; | 
| 91 | 48 |  |  |  |  | 66 | return; | 
| 92 |  |  |  |  |  |  | } | 
| 93 |  |  |  |  |  |  |  | 
| 94 |  |  |  |  |  |  | #pod =head2 C<cardinality> | 
| 95 |  |  |  |  |  |  | #pod | 
| 96 |  |  |  |  |  |  | #pod Returns the number of features contained in the feature vector of a | 
| 97 |  |  |  |  |  |  | #pod single item. | 
| 98 |  |  |  |  |  |  | #pod | 
| 99 |  |  |  |  |  |  | #pod =cut | 
| 100 |  |  |  |  |  |  | sub cardinality { | 
| 101 | 881 |  |  | 881 | 1 | 1362 | my ($self) = @_; | 
| 102 | 881 |  |  |  |  | 2218 | return $self->{cardinality}; | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | #pod =head2 C<size> | 
| 106 |  |  |  |  |  |  | #pod | 
| 107 |  |  |  |  |  |  | #pod Returns the number of items in the data set. | 
| 108 |  |  |  |  |  |  | #pod | 
| 109 |  |  |  |  |  |  | #pod =cut | 
| 110 |  |  |  |  |  |  | sub size { | 
| 111 | 830 |  |  | 830 | 1 | 61262 | my ($self) = @_; | 
| 112 | 830 |  |  |  |  | 1053 | return scalar @{$self->{items}}; | 
|  | 830 |  |  |  |  | 2530 |  | 
| 113 |  |  |  |  |  |  | } | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | #pod =head2 C<classes> | 
| 116 |  |  |  |  |  |  | #pod | 
| 117 |  |  |  |  |  |  | #pod Returns the list of all unique class labels in the data set. | 
| 118 |  |  |  |  |  |  | #pod | 
| 119 |  |  |  |  |  |  | #pod =cut | 
| 120 |  |  |  |  |  |  | sub classes { | 
| 121 | 0 |  |  | 0 | 1 | 0 | my ($self) = @_; | 
| 122 | 0 |  |  |  |  | 0 | return @{ $self->{class_list} }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 123 |  |  |  |  |  |  | } | 
| 124 |  |  |  |  |  |  |  | 
| 125 |  |  |  |  |  |  | #pod =head2 C<add_item> | 
| 126 |  |  |  |  |  |  | #pod | 
| 127 |  |  |  |  |  |  | #pod Adds a new item to the data set. The input may be either an | 
| 128 |  |  |  |  |  |  | #pod L<Algorithm::AM::DataSet::Item> object, or the arguments to create | 
| 129 |  |  |  |  |  |  | #pod one via its constructor (features, class, comment). This method will | 
| 130 |  |  |  |  |  |  | #pod croak if the cardinality of the item does not match L</cardinality>. | 
| 131 |  |  |  |  |  |  | #pod | 
| 132 |  |  |  |  |  |  | #pod =cut | 
| 133 |  |  |  |  |  |  | sub add_item { | 
| 134 | 276 |  |  | 276 | 1 | 745 | my ($self, @args) = @_; | 
| 135 | 276 |  |  |  |  | 314 | my $item; | 
| 136 | 276 | 100 |  |  |  | 512 | if('Algorithm::AM::DataSet::Item' eq ref $args[0]){ | 
| 137 | 219 |  |  |  |  | 255 | $item = $args[0]; | 
| 138 |  |  |  |  |  |  | }else{ | 
| 139 | 57 |  |  |  |  | 156 | $item = Algorithm::AM::DataSet::Item->new(@args); | 
| 140 |  |  |  |  |  |  | } | 
| 141 |  |  |  |  |  |  |  | 
| 142 | 275 | 100 |  |  |  | 456 | if($self->cardinality != $item->cardinality){ | 
| 143 |  |  |  |  |  |  | croak 'Expected ' . $self->cardinality . | 
| 144 |  |  |  |  |  |  | ' features, but found ' . (scalar $item->cardinality) . | 
| 145 | 1 |  |  |  |  | 4 | ' in ' . (join ' ', @{$item->features}) . | 
|  | 1 |  |  |  |  | 2 |  | 
| 146 |  |  |  |  |  |  | ' (' . $item->comment . ')'; | 
| 147 |  |  |  |  |  |  | } | 
| 148 |  |  |  |  |  |  |  | 
| 149 | 274 | 100 |  |  |  | 569 | if(defined $item->class){ | 
| 150 | 270 |  |  |  |  | 437 | $self->_update_class_vars($item->class); | 
| 151 |  |  |  |  |  |  | } | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | # store the new item | 
| 154 | 274 |  |  |  |  | 335 | push @{$self->{items}}, $item; | 
|  | 274 |  |  |  |  | 516 |  | 
| 155 | 274 |  |  |  |  | 502 | return; | 
| 156 |  |  |  |  |  |  | } | 
| 157 |  |  |  |  |  |  |  | 
| 158 |  |  |  |  |  |  | # keep track of classes; needs updating for new item | 
| 159 |  |  |  |  |  |  | sub _update_class_vars { | 
| 160 | 270 |  |  | 270 |  | 404 | my ($self, $class) = @_; | 
| 161 |  |  |  |  |  |  |  | 
| 162 | 270 | 100 |  |  |  | 549 | if(!$self->{class_num_index}->{$class}){ | 
| 163 | 47 |  |  |  |  | 74 | $self->{num_classes}++; | 
| 164 | 47 |  |  |  |  | 100 | $self->{class_num_index}->{$class} = $self->{num_classes}; | 
| 165 | 47 |  |  |  |  | 62 | push @{$self->{class_list}}, $class; | 
|  | 47 |  |  |  |  | 162 |  | 
| 166 |  |  |  |  |  |  | } | 
| 167 | 270 |  |  |  |  | 348 | return; | 
| 168 |  |  |  |  |  |  | } | 
| 169 |  |  |  |  |  |  |  | 
| 170 |  |  |  |  |  |  | #pod =head2 C<get_item> | 
| 171 |  |  |  |  |  |  | #pod | 
| 172 |  |  |  |  |  |  | #pod Return the item at the given index. This will be a | 
| 173 |  |  |  |  |  |  | #pod L<Algorithm::AM::DataSet::Item> object. | 
| 174 |  |  |  |  |  |  | #pod | 
| 175 |  |  |  |  |  |  | #pod =cut | 
| 176 |  |  |  |  |  |  | sub get_item { | 
| 177 | 60306 |  |  | 60306 | 1 | 76637 | my ($self, $index) = @_; | 
| 178 | 60306 |  |  |  |  | 103392 | return $self->{items}->[$index]; | 
| 179 |  |  |  |  |  |  | } | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | #pod =head2 C<num_classes> | 
| 182 |  |  |  |  |  |  | #pod | 
| 183 |  |  |  |  |  |  | #pod Returns the number of different classification labels contained in | 
| 184 |  |  |  |  |  |  | #pod the data set. | 
| 185 |  |  |  |  |  |  | #pod | 
| 186 |  |  |  |  |  |  | #pod =cut | 
| 187 |  |  |  |  |  |  | sub num_classes { | 
| 188 | 388 |  |  | 388 | 1 | 1635 | my ($self) = @_; | 
| 189 | 388 |  |  |  |  | 1379 | return $self->{num_classes}; | 
| 190 |  |  |  |  |  |  | } | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | # Used by AM. Return an arrayref containing all of the | 
| 193 |  |  |  |  |  |  | # classes for the data set (ordered the same as the data set). | 
| 194 |  |  |  |  |  |  | sub _data_classes { | 
| 195 | 194 |  |  | 194 |  | 282 | my ($self) = @_; | 
| 196 |  |  |  |  |  |  | my @classes = map { | 
| 197 | 30018 | 50 |  |  |  | 42696 | defined $_->class ? | 
| 198 |  |  |  |  |  |  | $self->_index_for_class($_->class) : | 
| 199 |  |  |  |  |  |  | undef | 
| 200 | 194 |  |  |  |  | 321 | } @{$self->{items}}; | 
|  | 194 |  |  |  |  | 446 |  | 
| 201 | 194 |  |  |  |  | 1097 | return \@classes; | 
| 202 |  |  |  |  |  |  | } | 
| 203 |  |  |  |  |  |  |  | 
| 204 |  |  |  |  |  |  | # Used by AM. Return the integer mapped to the given class string. | 
| 205 |  |  |  |  |  |  | sub _index_for_class { | 
| 206 | 60053 |  |  | 60053 |  | 75647 | my ($self, $class) = @_; | 
| 207 | 60053 |  |  |  |  | 102113 | return $self->{class_num_index}->{$class}; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | # Used by Result, which traverses data structures from | 
| 211 |  |  |  |  |  |  | # AM's guts. | 
| 212 |  |  |  |  |  |  | sub _class_for_index { | 
| 213 | 364 |  |  | 364 |  | 1857 | my ($self, $index) = @_; | 
| 214 | 364 |  |  |  |  | 1295 | return $self->{class_list}->[$index - 1]; | 
| 215 |  |  |  |  |  |  | } | 
| 216 |  |  |  |  |  |  |  | 
| 217 |  |  |  |  |  |  | #pod =head2 C<dataset_from_file> | 
| 218 |  |  |  |  |  |  | #pod | 
| 219 |  |  |  |  |  |  | #pod This function may be exported. Given 'path' and 'format' arguments, | 
| 220 |  |  |  |  |  |  | #pod it reads a file containing a dataset and returns a new DataSet object | 
| 221 |  |  |  |  |  |  | #pod with the given data. The 'path' argument should be the path to the | 
| 222 |  |  |  |  |  |  | #pod file. The 'format' argument should be 'commas' or 'nocommas', | 
| 223 |  |  |  |  |  |  | #pod indicating one of the following formats. You may also specify 'unknown' | 
| 224 |  |  |  |  |  |  | #pod and 'null' arguments to indicate the strings meant to represent an | 
| 225 |  |  |  |  |  |  | #pod unknown class value and null feature values. By default these are | 
| 226 |  |  |  |  |  |  | #pod 'UNK' and '='. | 
| 227 |  |  |  |  |  |  | #pod | 
| 228 |  |  |  |  |  |  | #pod The 'commas' file format is shown below: | 
| 229 |  |  |  |  |  |  | #pod | 
| 230 |  |  |  |  |  |  | #pod  class , f eat u re s , your comment here | 
| 231 |  |  |  |  |  |  | #pod | 
| 232 |  |  |  |  |  |  | #pod The commas separate the class label, feature values, and comments, | 
| 233 |  |  |  |  |  |  | #pod and the whitespace around the commas is optional. Each feature value | 
| 234 |  |  |  |  |  |  | #pod is separated with whitespace. | 
| 235 |  |  |  |  |  |  | #pod | 
| 236 |  |  |  |  |  |  | #pod The 'nocommas' file format is shown below: | 
| 237 |  |  |  |  |  |  | #pod | 
| 238 |  |  |  |  |  |  | #pod  class   features  your comment here | 
| 239 |  |  |  |  |  |  | #pod | 
| 240 |  |  |  |  |  |  | #pod Here the class, feature values, and comments are separated by | 
| 241 |  |  |  |  |  |  | #pod whitespace. Each feature value must be a single character with no | 
| 242 |  |  |  |  |  |  | #pod separating characters, so here the features are f, e, a, t, u, r, | 
| 243 |  |  |  |  |  |  | #pod e, and s. | 
| 244 |  |  |  |  |  |  | #pod | 
| 245 |  |  |  |  |  |  | #pod Lines beginning with a pound character (C<#>) are ignored. | 
| 246 |  |  |  |  |  |  | #pod | 
| 247 |  |  |  |  |  |  | #pod =cut | 
| 248 |  |  |  |  |  |  | sub dataset_from_file {## no critic (RequireArgUnpacking) | 
| 249 | 12 |  |  | 12 | 1 | 9579 | my (%opts) = ( | 
| 250 |  |  |  |  |  |  | unknown => 'UNK', | 
| 251 |  |  |  |  |  |  | null => '=', | 
| 252 |  |  |  |  |  |  | @_ | 
| 253 |  |  |  |  |  |  | ); | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | croak q[Failed to provide 'path' parameter] | 
| 256 | 12 | 100 |  |  |  | 56 | unless exists $opts{path}; | 
| 257 |  |  |  |  |  |  | croak q[Failed to provide 'format' parameter] | 
| 258 | 11 | 100 |  |  |  | 42 | unless exists $opts{format}; | 
| 259 |  |  |  |  |  |  |  | 
| 260 |  |  |  |  |  |  | my ($path, $format, $unknown, $null) = ( | 
| 261 | 10 |  |  |  |  | 33 | path($opts{path}), @opts{'format', 'unknown', 'null'}); | 
| 262 |  |  |  |  |  |  |  | 
| 263 | 10 | 100 |  |  |  | 286 | croak "Could not find file $path" | 
| 264 |  |  |  |  |  |  | unless $path->exists; | 
| 265 |  |  |  |  |  |  |  | 
| 266 | 9 |  |  |  |  | 298 | my ($field_sep, $feature_sep); | 
| 267 | 9 | 100 |  |  |  | 38 | if($format eq 'commas'){ | 
|  |  | 100 |  |  |  |  |  | 
| 268 |  |  |  |  |  |  | # class/features/comment separated by a comma | 
| 269 | 4 |  |  |  |  | 24 | $field_sep   = qr{\s*,\s*}; | 
| 270 |  |  |  |  |  |  | # features separated by space | 
| 271 | 4 |  |  |  |  | 14 | $feature_sep = qr{\s+}; | 
| 272 |  |  |  |  |  |  | }elsif($format eq 'nocommas'){ | 
| 273 |  |  |  |  |  |  | # class/features/comment separated by space | 
| 274 | 4 |  |  |  |  | 30 | $field_sep   = qr{\s+}; | 
| 275 |  |  |  |  |  |  | # no seps for features; each is a single character | 
| 276 | 4 |  |  |  |  | 16 | $feature_sep = qr{}; | 
| 277 |  |  |  |  |  |  | }else{ | 
| 278 | 1 |  |  |  |  | 16 | croak "Unknown value $format for format parameter " . | 
| 279 |  |  |  |  |  |  | q{(should be 'commas' or 'nocommas')}; | 
| 280 |  |  |  |  |  |  | } | 
| 281 |  |  |  |  |  |  |  | 
| 282 | 8 | 50 |  |  |  | 26 | if(!defined $unknown){ | 
| 283 | 0 |  |  |  |  | 0 | croak q[Must provide a defined value for 'unknown' parameter]; | 
| 284 |  |  |  |  |  |  | } | 
| 285 |  |  |  |  |  |  |  | 
| 286 | 8 |  |  |  |  | 43 | my $reader = _read_data_sub( | 
| 287 |  |  |  |  |  |  | $path, $unknown, $null, $field_sep, $feature_sep); | 
| 288 | 8 |  |  |  |  | 22 | my $item = $reader->(); | 
| 289 | 8 | 50 |  |  |  | 39 | if(!$item){ | 
| 290 | 0 |  |  |  |  | 0 | croak "No data found in file $path"; | 
| 291 |  |  |  |  |  |  | } | 
| 292 | 8 |  |  |  |  | 33 | my $dataset = __PACKAGE__->new(cardinality => $item->cardinality); | 
| 293 | 8 |  |  |  |  | 30 | $dataset->add_item($item); | 
| 294 | 8 |  |  |  |  | 19 | while($item = $reader->()){ | 
| 295 | 189 |  |  |  |  | 363 | $dataset->add_item($item); | 
| 296 |  |  |  |  |  |  | } | 
| 297 | 7 |  |  |  |  | 236 | return $dataset; | 
| 298 |  |  |  |  |  |  | } | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | # return a sub that returns one Item per call from the given FH, | 
| 301 |  |  |  |  |  |  | # and returns undef once the file is done being read. Throws errors | 
| 302 |  |  |  |  |  |  | # on bad file contents. | 
| 303 |  |  |  |  |  |  | # Input is file (Path::Tiny), string representing unknown class, | 
| 304 |  |  |  |  |  |  | # string representing null feature, field separator (class, | 
| 305 |  |  |  |  |  |  | # features, comment) and feature separator | 
| 306 |  |  |  |  |  |  | sub _read_data_sub { | 
| 307 | 8 |  |  | 8 |  | 27 | my ($data_file, $unknown, $null, | 
| 308 |  |  |  |  |  |  | $field_sep, $feature_sep) = @_; | 
| 309 | 8 |  |  |  |  | 34 | my $data_fh = $data_file->openr_utf8; | 
| 310 | 8 |  |  |  |  | 49553 | my $line_num = 0; | 
| 311 |  |  |  |  |  |  | return sub { | 
| 312 | 205 |  |  | 205 |  | 250 | my $line; | 
| 313 |  |  |  |  |  |  | # grab the next non-blank line from the file | 
| 314 | 205 |  |  |  |  | 911 | while($line = <$data_fh>){ | 
| 315 | 201 |  |  |  |  | 360 | $line_num++; | 
| 316 |  |  |  |  |  |  | # skip comments | 
| 317 | 201 | 100 |  |  |  | 448 | next if $line =~ m/^\s*#/; | 
| 318 |  |  |  |  |  |  | # cross-platform chomp | 
| 319 | 199 |  |  |  |  | 849 | $line =~ s/\R$//; | 
| 320 | 199 |  |  |  |  | 958 | $line =~ s/^\s+|\s+$//g; | 
| 321 | 199 | 100 |  |  |  | 352 | last if $line; | 
| 322 |  |  |  |  |  |  | } | 
| 323 | 205 | 100 |  |  |  | 360 | return unless $line; | 
| 324 | 198 |  |  |  |  | 1040 | my ($class, $feats, $comment) = split /$field_sep/, $line, 3; | 
| 325 |  |  |  |  |  |  | # the line has to have at least the class label and features | 
| 326 | 198 | 100 |  |  |  | 404 | if(!defined $feats){ | 
| 327 | 1 |  |  |  |  | 6 | croak "Couldn't read data at line $line_num in $data_file"; | 
| 328 |  |  |  |  |  |  | } | 
| 329 |  |  |  |  |  |  | # if the class is specified as unknown, set it to undef to | 
| 330 |  |  |  |  |  |  | # indicate this to Item | 
| 331 | 197 | 100 |  |  |  | 312 | if($class eq $unknown){ | 
| 332 | 4 |  |  |  |  | 9 | undef $class; | 
| 333 |  |  |  |  |  |  | } | 
| 334 |  |  |  |  |  |  |  | 
| 335 | 197 |  |  |  |  | 1380 | my @data_vars = split /$feature_sep/, $feats; | 
| 336 |  |  |  |  |  |  | # set null features to '' | 
| 337 | 197 | 100 |  |  |  | 414 | @data_vars = map {$_ eq $null ? '' : $_} @data_vars; | 
|  | 1913 |  |  |  |  | 3297 |  | 
| 338 |  |  |  |  |  |  |  | 
| 339 | 197 |  |  |  |  | 625 | return Algorithm::AM::DataSet::Item->new( | 
| 340 |  |  |  |  |  |  | features=> \@data_vars, | 
| 341 |  |  |  |  |  |  | class => $class, | 
| 342 |  |  |  |  |  |  | comment => $comment | 
| 343 |  |  |  |  |  |  | ); | 
| 344 | 8 |  |  |  |  | 69 | }; | 
| 345 |  |  |  |  |  |  | } | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | 1; | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | __END__ |