File Coverage

blib/lib/Algorithm/AM/DataSet.pm
Criterion Covered Total %
statement 118 123 95.9
branch 37 40 92.5
condition n/a
subroutine 21 22 95.4
pod 8 8 100.0
total 184 193 95.3


line stmt bran cond sub pod time code
1             package Algorithm::AM::DataSet;
2 10     10   76247 use strict;
  10         30  
  10         296  
3 10     10   49 use warnings;
  10         18  
  10         457  
4             our $VERSION = '3.12';
5             # ABSTRACT: Manage data used by Algorithm::AM
6 10     10   117 use Carp;
  10         22  
  10         628  
7 10     10   4357 use Algorithm::AM::DataSet::Item;
  10         23  
  10         441  
8 10     10   7017 use Path::Tiny;
  10         99881  
  10         603  
9             use Exporter::Easy (
10 10         69 OK => ['dataset_from_file']
11 10     10   94 );
  10         21  
12              
13             #pod =head1 SYNOPSIS
14             #pod
15             #pod use Algorithm::AM::DataSet 'dataset_from_file';
16             #pod use Algorithm::AM::DataSet::Item 'new_item';
17             #pod my $dataset = Algorithm::AM::DataSet->new(cardinality => 10);
18             #pod # or
19             #pod $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas');
20             #pod $dataset->add_item(
21             #pod new_item(features => [qw(a b c d e f g h i)]));
22             #pod my $item = $dataset->get_item(2);
23             #pod
24             #pod =head1 DESCRIPTION
25             #pod
26             #pod This package contains a list of items that can be used by
27             #pod L or L for classification.
28             #pod DataSets can be made one item at a time via the L method,
29             #pod or they can be read from files via the L function.
30             #pod
31             #pod =head2 C
32             #pod
33             #pod Creates a new DataSet object. You must provide a C argument
34             #pod indicating the number of features to be contained in each data vector.
35             #pod You can then add items via the add_item method. Each item will contain
36             #pod a feature vector, and also optionally a class label and a comment
37             #pod (also called a "spec").
38             #pod
39             #pod =cut
40             sub new {
41 50     50 1 34249 my ($class, %opts) = @_;
42              
43 50         176 my $new_opts = _check_opts(%opts);
44              
45 48         96 my $self = bless $new_opts, $class;
46              
47 48         261 $self->_init;
48              
49 48         166 return $self;
50             }
51              
52             # check the options for validity
53             # Return an option hash to initialize $self with
54             # For now only 'cardinality' is allowed/required.
55             sub _check_opts {
56 50     50   131 my (%opts) = @_;
57              
58 50         79 my %final_opts;
59              
60 50 100       141 if(!defined $opts{cardinality}){
61 1         16 croak q{Failed to provide 'cardinality' parameter};
62             }
63 49         111 $final_opts{cardinality} = $opts{cardinality};
64 49         112 delete $opts{cardinality};
65              
66 49 100       125 if(keys %opts){
67             # sort the keys in the error message to make testing possible
68 1         15 croak 'Unknown parameters in DataSet constructor: ' .
69             (join ', ', sort keys %opts);
70             }
71              
72 48         111 return \%final_opts;
73             }
74              
75             # initialize internal state
76             sub _init {
77 48     48   79 my ($self) = @_;
78             # contains all of the items in the dataset
79 48         171 $self->{items} = [];
80              
81             # map unique class labels to unique integers;
82             # these are the indices of the class labels in class_list below;
83             # the indices must start at 1 for AM to work, as 0 is reserved
84             # for heterogeneity.
85 48         115 $self->{class_num_index} = {};
86             # contains the list of class strings in an order that matches
87             # the indices in class_num_index
88 48         83 $self->{class_list} = [];
89             # the total number of different classes contained in the data set
90 48         76 $self->{num_classes} = 0;
91 48         66 return;
92             }
93              
94             #pod =head2 C
95             #pod
96             #pod Returns the number of features contained in the feature vector of a
97             #pod single item.
98             #pod
99             #pod =cut
100             sub cardinality {
101 881     881 1 1362 my ($self) = @_;
102 881         2218 return $self->{cardinality};
103             }
104              
105             #pod =head2 C
106             #pod
107             #pod Returns the number of items in the data set.
108             #pod
109             #pod =cut
110             sub size {
111 830     830 1 61262 my ($self) = @_;
112 830         1053 return scalar @{$self->{items}};
  830         2530  
113             }
114              
115             #pod =head2 C
116             #pod
117             #pod Returns the list of all unique class labels in the data set.
118             #pod
119             #pod =cut
120             sub classes {
121 0     0 1 0 my ($self) = @_;
122 0         0 return @{ $self->{class_list} };
  0         0  
123             }
124              
125             #pod =head2 C
126             #pod
127             #pod Adds a new item to the data set. The input may be either an
128             #pod L object, or the arguments to create
129             #pod one via its constructor (features, class, comment). This method will
130             #pod croak if the cardinality of the item does not match L.
131             #pod
132             #pod =cut
133             sub add_item {
134 276     276 1 745 my ($self, @args) = @_;
135 276         314 my $item;
136 276 100       512 if('Algorithm::AM::DataSet::Item' eq ref $args[0]){
137 219         255 $item = $args[0];
138             }else{
139 57         156 $item = Algorithm::AM::DataSet::Item->new(@args);
140             }
141              
142 275 100       456 if($self->cardinality != $item->cardinality){
143             croak 'Expected ' . $self->cardinality .
144             ' features, but found ' . (scalar $item->cardinality) .
145 1         4 ' in ' . (join ' ', @{$item->features}) .
  1         2  
146             ' (' . $item->comment . ')';
147             }
148              
149 274 100       569 if(defined $item->class){
150 270         437 $self->_update_class_vars($item->class);
151             }
152              
153             # store the new item
154 274         335 push @{$self->{items}}, $item;
  274         516  
155 274         502 return;
156             }
157              
158             # keep track of classes; needs updating for new item
159             sub _update_class_vars {
160 270     270   404 my ($self, $class) = @_;
161              
162 270 100       549 if(!$self->{class_num_index}->{$class}){
163 47         74 $self->{num_classes}++;
164 47         100 $self->{class_num_index}->{$class} = $self->{num_classes};
165 47         62 push @{$self->{class_list}}, $class;
  47         162  
166             }
167 270         348 return;
168             }
169              
170             #pod =head2 C
171             #pod
172             #pod Return the item at the given index. This will be a
173             #pod L object.
174             #pod
175             #pod =cut
176             sub get_item {
177 60306     60306 1 76637 my ($self, $index) = @_;
178 60306         103392 return $self->{items}->[$index];
179             }
180              
181             #pod =head2 C
182             #pod
183             #pod Returns the number of different classification labels contained in
184             #pod the data set.
185             #pod
186             #pod =cut
187             sub num_classes {
188 388     388 1 1635 my ($self) = @_;
189 388         1379 return $self->{num_classes};
190             }
191              
192             # Used by AM. Return an arrayref containing all of the
193             # classes for the data set (ordered the same as the data set).
194             sub _data_classes {
195 194     194   282 my ($self) = @_;
196             my @classes = map {
197 30018 50       42696 defined $_->class ?
198             $self->_index_for_class($_->class) :
199             undef
200 194         321 } @{$self->{items}};
  194         446  
201 194         1097 return \@classes;
202             }
203              
204             # Used by AM. Return the integer mapped to the given class string.
205             sub _index_for_class {
206 60053     60053   75647 my ($self, $class) = @_;
207 60053         102113 return $self->{class_num_index}->{$class};
208             }
209              
210             # Used by Result, which traverses data structures from
211             # AM's guts.
212             sub _class_for_index {
213 364     364   1857 my ($self, $index) = @_;
214 364         1295 return $self->{class_list}->[$index - 1];
215             }
216              
217             #pod =head2 C
218             #pod
219             #pod This function may be exported. Given 'path' and 'format' arguments,
220             #pod it reads a file containing a dataset and returns a new DataSet object
221             #pod with the given data. The 'path' argument should be the path to the
222             #pod file. The 'format' argument should be 'commas' or 'nocommas',
223             #pod indicating one of the following formats. You may also specify 'unknown'
224             #pod and 'null' arguments to indicate the strings meant to represent an
225             #pod unknown class value and null feature values. By default these are
226             #pod 'UNK' and '='.
227             #pod
228             #pod The 'commas' file format is shown below:
229             #pod
230             #pod class , f eat u re s , your comment here
231             #pod
232             #pod The commas separate the class label, feature values, and comments,
233             #pod and the whitespace around the commas is optional. Each feature value
234             #pod is separated with whitespace.
235             #pod
236             #pod The 'nocommas' file format is shown below:
237             #pod
238             #pod class features your comment here
239             #pod
240             #pod Here the class, feature values, and comments are separated by
241             #pod whitespace. Each feature value must be a single character with no
242             #pod separating characters, so here the features are f, e, a, t, u, r,
243             #pod e, and s.
244             #pod
245             #pod Lines beginning with a pound character (C<#>) are ignored.
246             #pod
247             #pod =cut
248             sub dataset_from_file {## no critic (RequireArgUnpacking)
249 12     12 1 9579 my (%opts) = (
250             unknown => 'UNK',
251             null => '=',
252             @_
253             );
254              
255             croak q[Failed to provide 'path' parameter]
256 12 100       56 unless exists $opts{path};
257             croak q[Failed to provide 'format' parameter]
258 11 100       42 unless exists $opts{format};
259              
260             my ($path, $format, $unknown, $null) = (
261 10         33 path($opts{path}), @opts{'format', 'unknown', 'null'});
262              
263 10 100       286 croak "Could not find file $path"
264             unless $path->exists;
265              
266 9         298 my ($field_sep, $feature_sep);
267 9 100       38 if($format eq 'commas'){
    100          
268             # class/features/comment separated by a comma
269 4         24 $field_sep = qr{\s*,\s*};
270             # features separated by space
271 4         14 $feature_sep = qr{\s+};
272             }elsif($format eq 'nocommas'){
273             # class/features/comment separated by space
274 4         30 $field_sep = qr{\s+};
275             # no seps for features; each is a single character
276 4         16 $feature_sep = qr{};
277             }else{
278 1         16 croak "Unknown value $format for format parameter " .
279             q{(should be 'commas' or 'nocommas')};
280             }
281              
282 8 50       26 if(!defined $unknown){
283 0         0 croak q[Must provide a defined value for 'unknown' parameter];
284             }
285              
286 8         43 my $reader = _read_data_sub(
287             $path, $unknown, $null, $field_sep, $feature_sep);
288 8         22 my $item = $reader->();
289 8 50       39 if(!$item){
290 0         0 croak "No data found in file $path";
291             }
292 8         33 my $dataset = __PACKAGE__->new(cardinality => $item->cardinality);
293 8         30 $dataset->add_item($item);
294 8         19 while($item = $reader->()){
295 189         363 $dataset->add_item($item);
296             }
297 7         236 return $dataset;
298             }
299              
300             # return a sub that returns one Item per call from the given FH,
301             # and returns undef once the file is done being read. Throws errors
302             # on bad file contents.
303             # Input is file (Path::Tiny), string representing unknown class,
304             # string representing null feature, field separator (class,
305             # features, comment) and feature separator
306             sub _read_data_sub {
307 8     8   27 my ($data_file, $unknown, $null,
308             $field_sep, $feature_sep) = @_;
309 8         34 my $data_fh = $data_file->openr_utf8;
310 8         49553 my $line_num = 0;
311             return sub {
312 205     205   250 my $line;
313             # grab the next non-blank line from the file
314 205         911 while($line = <$data_fh>){
315 201         360 $line_num++;
316             # skip comments
317 201 100       448 next if $line =~ m/^\s*#/;
318             # cross-platform chomp
319 199         849 $line =~ s/\R$//;
320 199         958 $line =~ s/^\s+|\s+$//g;
321 199 100       352 last if $line;
322             }
323 205 100       360 return unless $line;
324 198         1040 my ($class, $feats, $comment) = split /$field_sep/, $line, 3;
325             # the line has to have at least the class label and features
326 198 100       404 if(!defined $feats){
327 1         6 croak "Couldn't read data at line $line_num in $data_file";
328             }
329             # if the class is specified as unknown, set it to undef to
330             # indicate this to Item
331 197 100       312 if($class eq $unknown){
332 4         9 undef $class;
333             }
334              
335 197         1380 my @data_vars = split /$feature_sep/, $feats;
336             # set null features to ''
337 197 100       414 @data_vars = map {$_ eq $null ? '' : $_} @data_vars;
  1913         3297  
338              
339 197         625 return Algorithm::AM::DataSet::Item->new(
340             features=> \@data_vars,
341             class => $class,
342             comment => $comment
343             );
344 8         69 };
345             }
346              
347             1;
348              
349             __END__