File Coverage

blib/lib/Algorithm/AM/DataSet.pm

Criterion	Covered	Total	%
statement	118	123	95.9
branch	37	40	92.5
condition			n/a
subroutine	21	22	95.4
pod	8	8	100.0
total	184	193	95.3

line	stmt	bran	sub	pod	time	code
1						package Algorithm::AM::DataSet;
2	10		10		76247	use strict;
	10				30
	10				296
3	10		10		49	use warnings;
	10				18
	10				457
4						our $VERSION = '3.12';
5						# ABSTRACT: Manage data used by Algorithm::AM
6	10		10		117	use Carp;
	10				22
	10				628
7	10		10		4357	use Algorithm::AM::DataSet::Item;
	10				23
	10				441
8	10		10		7017	use Path::Tiny;
	10				99881
	10				603
9						use Exporter::Easy (
10	10				69	OK => ['dataset_from_file']
11	10		10		94	);
	10				21
12
13						#pod =head1 SYNOPSIS
14						#pod
15						#pod use Algorithm::AM::DataSet 'dataset_from_file';
16						#pod use Algorithm::AM::DataSet::Item 'new_item';
17						#pod my $dataset = Algorithm::AM::DataSet->new(cardinality => 10);
18						#pod # or
19						#pod $dataset = dataset_from_file(path => 'finnverb', format => 'nocommas');
20						#pod $dataset->add_item(
21						#pod new_item(features => [qw(a b c d e f g h i)]));
22						#pod my $item = $dataset->get_item(2);
23						#pod
24						#pod =head1 DESCRIPTION
25						#pod
26						#pod This package contains a list of items that can be used by
27						#pod L or L for classification.
28						#pod DataSets can be made one item at a time via the L method,
29						#pod or they can be read from files via the L function.
30						#pod
31						#pod =head2 C
32						#pod
33						#pod Creates a new DataSet object. You must provide a C argument
34						#pod indicating the number of features to be contained in each data vector.
35						#pod You can then add items via the add_item method. Each item will contain
36						#pod a feature vector, and also optionally a class label and a comment
37						#pod (also called a "spec").
38						#pod
39						#pod =cut
40						sub new {
41	50		50	1	34249	my ($class, %opts) = @_;
42
43	50				176	my $new_opts = _check_opts(%opts);
44
45	48				96	my $self = bless $new_opts, $class;
46
47	48				261	$self->_init;
48
49	48				166	return $self;
50						}
51
52						# check the options for validity
53						# Return an option hash to initialize $self with
54						# For now only 'cardinality' is allowed/required.
55						sub _check_opts {
56	50		50		131	my (%opts) = @_;
57
58	50				79	my %final_opts;
59
60	50	100			141	if(!defined $opts{cardinality}){
61	1				16	croak q{Failed to provide 'cardinality' parameter};
62						}
63	49				111	$final_opts{cardinality} = $opts{cardinality};
64	49				112	delete $opts{cardinality};
65
66	49	100			125	if(keys %opts){
67						# sort the keys in the error message to make testing possible
68	1				15	croak 'Unknown parameters in DataSet constructor: ' .
69						(join ', ', sort keys %opts);
70						}
71
72	48				111	return \%final_opts;
73						}
74
75						# initialize internal state
76						sub _init {
77	48		48		79	my ($self) = @_;
78						# contains all of the items in the dataset
79	48				171	$self->{items} = [];
80
81						# map unique class labels to unique integers;
82						# these are the indices of the class labels in class_list below;
83						# the indices must start at 1 for AM to work, as 0 is reserved
84						# for heterogeneity.
85	48				115	$self->{class_num_index} = {};
86						# contains the list of class strings in an order that matches
87						# the indices in class_num_index
88	48				83	$self->{class_list} = [];
89						# the total number of different classes contained in the data set
90	48				76	$self->{num_classes} = 0;
91	48				66	return;
92						}
93
94						#pod =head2 C
95						#pod
96						#pod Returns the number of features contained in the feature vector of a
97						#pod single item.
98						#pod
99						#pod =cut
100						sub cardinality {
101	881		881	1	1362	my ($self) = @_;
102	881				2218	return $self->{cardinality};
103						}
104
105						#pod =head2 C
106						#pod
107						#pod Returns the number of items in the data set.
108						#pod
109						#pod =cut
110						sub size {
111	830		830	1	61262	my ($self) = @_;
112	830				1053	return scalar @{$self->{items}};
	830				2530
113						}
114
115						#pod =head2 C
116						#pod
117						#pod Returns the list of all unique class labels in the data set.
118						#pod
119						#pod =cut
120						sub classes {
121	0		0	1	0	my ($self) = @_;
122	0				0	return @{ $self->{class_list} };
	0				0
123						}
124
125						#pod =head2 C
126						#pod
127						#pod Adds a new item to the data set. The input may be either an
128						#pod L object, or the arguments to create
129						#pod one via its constructor (features, class, comment). This method will
130						#pod croak if the cardinality of the item does not match L.
131						#pod
132						#pod =cut
133						sub add_item {
134	276		276	1	745	my ($self, @args) = @_;
135	276				314	my $item;
136	276	100			512	if('Algorithm::AM::DataSet::Item' eq ref $args[0]){
137	219				255	$item = $args[0];
138						}else{
139	57				156	$item = Algorithm::AM::DataSet::Item->new(@args);
140						}
141
142	275	100			456	if($self->cardinality != $item->cardinality){
143						croak 'Expected ' . $self->cardinality .
144						' features, but found ' . (scalar $item->cardinality) .
145	1				4	' in ' . (join ' ', @{$item->features}) .
	1				2
146						' (' . $item->comment . ')';
147						}
148
149	274	100			569	if(defined $item->class){
150	270				437	$self->_update_class_vars($item->class);
151						}
152
153						# store the new item
154	274				335	push @{$self->{items}}, $item;
	274				516
155	274				502	return;
156						}
157
158						# keep track of classes; needs updating for new item
159						sub _update_class_vars {
160	270		270		404	my ($self, $class) = @_;
161
162	270	100			549	if(!$self->{class_num_index}->{$class}){
163	47				74	$self->{num_classes}++;
164	47				100	$self->{class_num_index}->{$class} = $self->{num_classes};
165	47				62	push @{$self->{class_list}}, $class;
	47				162
166						}
167	270				348	return;
168						}
169
170						#pod =head2 C
171						#pod
172						#pod Return the item at the given index. This will be a
173						#pod L object.
174						#pod
175						#pod =cut
176						sub get_item {
177	60306		60306	1	76637	my ($self, $index) = @_;
178	60306				103392	return $self->{items}->[$index];
179						}
180
181						#pod =head2 C
182						#pod
183						#pod Returns the number of different classification labels contained in
184						#pod the data set.
185						#pod
186						#pod =cut
187						sub num_classes {
188	388		388	1	1635	my ($self) = @_;
189	388				1379	return $self->{num_classes};
190						}
191
192						# Used by AM. Return an arrayref containing all of the
193						# classes for the data set (ordered the same as the data set).
194						sub _data_classes {
195	194		194		282	my ($self) = @_;
196						my @classes = map {
197	30018	50			42696	defined $_->class ?
198						$self->_index_for_class($_->class) :
199						undef
200	194				321	} @{$self->{items}};
	194				446
201	194				1097	return \@classes;
202						}
203
204						# Used by AM. Return the integer mapped to the given class string.
205						sub _index_for_class {
206	60053		60053		75647	my ($self, $class) = @_;
207	60053				102113	return $self->{class_num_index}->{$class};
208						}
209
210						# Used by Result, which traverses data structures from
211						# AM's guts.
212						sub _class_for_index {
213	364		364		1857	my ($self, $index) = @_;
214	364				1295	return $self->{class_list}->[$index - 1];
215						}
216
217						#pod =head2 C
218						#pod
219						#pod This function may be exported. Given 'path' and 'format' arguments,
220						#pod it reads a file containing a dataset and returns a new DataSet object
221						#pod with the given data. The 'path' argument should be the path to the
222						#pod file. The 'format' argument should be 'commas' or 'nocommas',
223						#pod indicating one of the following formats. You may also specify 'unknown'
224						#pod and 'null' arguments to indicate the strings meant to represent an
225						#pod unknown class value and null feature values. By default these are
226						#pod 'UNK' and '='.
227						#pod
228						#pod The 'commas' file format is shown below:
229						#pod
230						#pod class , f eat u re s , your comment here
231						#pod
232						#pod The commas separate the class label, feature values, and comments,
233						#pod and the whitespace around the commas is optional. Each feature value
234						#pod is separated with whitespace.
235						#pod
236						#pod The 'nocommas' file format is shown below:
237						#pod
238						#pod class features your comment here
239						#pod
240						#pod Here the class, feature values, and comments are separated by
241						#pod whitespace. Each feature value must be a single character with no
242						#pod separating characters, so here the features are f, e, a, t, u, r,
243						#pod e, and s.
244						#pod
245						#pod Lines beginning with a pound character (C<#>) are ignored.
246						#pod
247						#pod =cut
248						sub dataset_from_file {## no critic (RequireArgUnpacking)
249	12		12	1	9579	my (%opts) = (
250						unknown => 'UNK',
251						null => '=',
252						@_
253						);
254
255						croak q[Failed to provide 'path' parameter]
256	12	100			56	unless exists $opts{path};
257						croak q[Failed to provide 'format' parameter]
258	11	100			42	unless exists $opts{format};
259
260						my ($path, $format, $unknown, $null) = (
261	10				33	path($opts{path}), @opts{'format', 'unknown', 'null'});
262
263	10	100			286	croak "Could not find file $path"
264						unless $path->exists;
265
266	9				298	my ($field_sep, $feature_sep);
267	9	100			38	if($format eq 'commas'){
		100
268						# class/features/comment separated by a comma
269	4				24	$field_sep = qr{\s,\s};
270						# features separated by space
271	4				14	$feature_sep = qr{\s+};
272						}elsif($format eq 'nocommas'){
273						# class/features/comment separated by space
274	4				30	$field_sep = qr{\s+};
275						# no seps for features; each is a single character
276	4				16	$feature_sep = qr{};
277						}else{
278	1				16	croak "Unknown value $format for format parameter " .
279						q{(should be 'commas' or 'nocommas')};
280						}
281
282	8	50			26	if(!defined $unknown){
283	0				0	croak q[Must provide a defined value for 'unknown' parameter];
284						}
285
286	8				43	my $reader = _read_data_sub(
287						$path, $unknown, $null, $field_sep, $feature_sep);
288	8				22	my $item = $reader->();
289	8	50			39	if(!$item){
290	0				0	croak "No data found in file $path";
291						}
292	8				33	my $dataset = __PACKAGE__->new(cardinality => $item->cardinality);
293	8				30	$dataset->add_item($item);
294	8				19	while($item = $reader->()){
295	189				363	$dataset->add_item($item);
296						}
297	7				236	return $dataset;
298						}
299
300						# return a sub that returns one Item per call from the given FH,
301						# and returns undef once the file is done being read. Throws errors
302						# on bad file contents.
303						# Input is file (Path::Tiny), string representing unknown class,
304						# string representing null feature, field separator (class,
305						# features, comment) and feature separator
306						sub _read_data_sub {
307	8		8		27	my ($data_file, $unknown, $null,
308						$field_sep, $feature_sep) = @_;
309	8				34	my $data_fh = $data_file->openr_utf8;
310	8				49553	my $line_num = 0;
311						return sub {
312	205		205		250	my $line;
313						# grab the next non-blank line from the file
314	205				911	while($line = <$data_fh>){
315	201				360	$line_num++;
316						# skip comments
317	201	100			448	next if $line =~ m/^\s*#/;
318						# cross-platform chomp
319	199				849	$line =~ s/\R$//;
320	199				958	$line =~ s/^\s+\|\s+$//g;
321	199	100			352	last if $line;
322						}
323	205	100			360	return unless $line;
324	198				1040	my ($class, $feats, $comment) = split /$field_sep/, $line, 3;
325						# the line has to have at least the class label and features
326	198	100			404	if(!defined $feats){
327	1				6	croak "Couldn't read data at line $line_num in $data_file";
328						}
329						# if the class is specified as unknown, set it to undef to
330						# indicate this to Item
331	197	100			312	if($class eq $unknown){
332	4				9	undef $class;
333						}
334
335	197				1380	my @data_vars = split /$feature_sep/, $feats;
336						# set null features to ''
337	197	100			414	@data_vars = map {$_ eq $null ? '' : $_} @data_vars;
	1913				3297
338
339	197				625	return Algorithm::AM::DataSet::Item->new(
340						features=> \@data_vars,
341						class => $class,
342						comment => $comment
343						);
344	8				69	};
345						}
346
347						1;
348
349						__END__