File Coverage

blib/lib/Data/Frame.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Data::Frame;
2				# ABSTRACT: data frame implementation
3				$Data::Frame::VERSION = '0.003';
4	5	5	216082	use strict;
	5		8
	5		149
5	5	5	18	use warnings;
	5		5
	5		94
6
7	5	5	2383	use Tie::IxHash;
	5		17061
	5		820
8	5	5	1694	use Tie::IxHash::Extension;
	0
	0
9				use PDL::Lite;
10				use Data::Perl ();
11				use List::AllUtils;
12				use Try::Tiny;
13				use PDL::SV;
14				use PDL::StringfiableExtension;
15				use Carp;
16				use Scalar::Util qw(blessed);
17
18				use Text::Table::Tiny;
19
20				use Data::Frame::Column::Helper;
21
22				use overload (
23				'""' => \&Data::Frame::string,
24				'==' => \&Data::Frame::equal,
25				'eq' => \&Data::Frame::equal,
26				);
27
28				{
29				# TODO temporary column role
30				no strict;
31				*PDL::number_of_rows = sub { $_[0]->getdim(0) };
32				*Data::Perl::Collection::Array::number_of_rows = sub { $_[0]->count };
33				}
34
35				use Moo;
36				with 'MooX::Traits';
37
38				sub _trait_namespace { 'Data::Frame::Role' } # override for MooX::Traits
39
40				has _columns => ( is => 'ro', default => sub { Tie::IxHash->new; } );
41
42				has _row_names => ( is => 'rw', predicate => 1 );
43
44				sub BUILD {
45				my ($self, $args) = @_;
46				my $colspec = delete $args->{columns};
47
48				if( defined $colspec ) {
49				my @columns =
50				ref $colspec eq 'HASH'
51				? map { ($_, $colspec->{$_} ) } sort { $a cmp $b } keys %$colspec
52				: @$colspec;
53				$self->add_columns(@columns);
54				}
55				}
56
57				sub string {
58				my ($self) = @_;
59				my $rows = [];
60				push @$rows, [ '', @{ $self->column_names } ];
61				for my $r_idx ( 0..$self->number_of_rows-1 ) {
62				my $r = [
63				$self->row_names->slice($r_idx)->squeeze->string,
64				map {
65				my $col = $self->nth_column($_);
66				$col->slice($r_idx)->squeeze->string
67				} 0..$self->number_of_columns-1 ];
68				push @$rows, $r;
69				}
70				{
71				# clear column separators
72				local $Text::Table::Tiny::COLUMN_SEPARATOR = '';
73				local $Text::Table::Tiny::CORNER_MARKER = '';
74
75				Text::Table::Tiny::table(rows => $rows, header_row => 1)
76				}
77				}
78
79				sub number_of_columns {
80				my ($self) = @_;
81				$self->_columns->Length;
82				}
83
84				sub number_of_rows {
85				my ($self) = @_;
86				if( $self->number_of_columns ) {
87				return $self->nth_column(0)->number_of_rows;
88				}
89				0;
90				}
91
92				# supports negative indices
93				sub nth_column {
94				my ($self, $index) = @_;
95				confess "requires index" unless defined $index;
96				confess "column index out of bounds" if $index >= $self->number_of_columns;
97				# fine if $index < 0 because negative indices are supported
98				$self->_columns->Values( $index );
99				}
100
101				sub column_names {
102				my ($self, @colnames) = @_;
103				if( @colnames ) {
104				try {
105				$self->_columns->RenameKeys( @colnames );
106				} catch {
107				confess "incorrect number of column names" if /@{[ Tie::IxHash::ERROR_KEY_LENGTH_MISMATCH ]}/;
108				};
109				}
110				[ $self->_columns->Keys ];
111				}
112
113				sub row_names {
114				my ($self, @rest) = @_;
115				if( @rest ) {
116				# setting row names
117				my $new_rows;
118				if( ref $rest[0] ) {
119				if( ref $rest[0] eq 'ARRAY' ) {
120				$new_rows = Data::Perl::array( @{ $rest[0] });
121				} elsif( $rest[0]->isa('PDL') ) {
122				# TODO just run uniq?
123				$new_rows = Data::Perl::array( @{ $rest[0]->unpdl } );
124				} else {
125				$new_rows = Data::Perl::array(@rest);
126				}
127				} else {
128				$new_rows = Data::Perl::array(@rest);
129				}
130
131				confess "invalid row names length"
132				if $self->number_of_rows != $new_rows->count;
133				confess "non-unique row names"
134				if $new_rows->count != $new_rows->uniq->count;
135
136				return $self->_row_names( PDL::SV->new($new_rows) );
137				}
138				if( not $self->_has_row_names ) {
139				# if it has never been set before
140				return sequence($self->number_of_rows);
141				}
142				# else, if row_names has been set
143				return $self->_row_names;
144				}
145
146				sub _make_actual_row_names {
147				my ($self) = @_;
148				if( not $self->_has_row_names ) {
149				$self->_row_names( $self->row_names );
150				}
151				}
152
153				sub column {
154				my ($self, $colname) = @_;
155				confess "column $colname does not exist" unless $self->_columns->EXISTS( $colname );
156				$self->_columns->FETCH( $colname );
157				}
158
159				sub _column_validate {
160				my ($self, $name, $data) = @_;
161				if( $name =~ /^\d+$/ ) {
162				confess "invalid column name: $name can not be an integer";
163				}
164				if( $self->number_of_columns ) {
165				if( $data->number_of_rows != $self->number_of_rows ) {
166				confess "number of rows in column is @{[ $data->number_of_rows ]}; expected @{[ $self->number_of_rows ]}";
167				}
168				}
169				1;
170				}
171
172				sub add_columns {
173				my ($self, @columns) = @_;
174				confess "uneven number of elements for column specification" unless @columns % 2 == 0;
175				for ( List::AllUtils::pairs(@columns) ) {
176				my ( $name, $data ) = @$_;
177				$self->add_column( $name => $data );
178				}
179				}
180
181				sub add_column {
182				my ($self, $name, $data) = @_;
183				confess "column $name already exists"
184				if $self->_columns->EXISTS( $name );
185
186				# TODO apply column role to data
187				$data = PDL::SV->new( $data ) if ref $data eq 'ARRAY';
188
189				$self->_column_validate( $name => $data);
190
191
192				$self->_columns->Push( $name => $data );
193				}
194
195				# R
196				# > iris[c(1,2,3,3,3,3),]
197				# PDL
198				# $ sequence(10,4)->dice(X,[0,1,1,0])
199				sub select_rows {
200				my ($self, @which_rest) = @_;
201
202				my $which = [];
203				if( @which_rest > 1 ) {
204				$which = \@which_rest; # array to arrayref
205				} elsif( @which_rest == 1 ) {
206				$which = $which_rest[0]; # get the first value off
207				} else { # @which_rest == 0
208				$which = pdl []; # Empty PDL
209				}
210
211				$which = PDL::Core::topdl($which); # ensure it is a PDL
212
213				my $colnames = $self->column_names;
214				my $colspec = [ map {
215				( $colnames->[$_] => $self->nth_column($_)->dice($which) )
216				} 0..$self->number_of_columns-1 ];
217
218				$self->_make_actual_row_names;
219				my $select_df = $self->new(
220				columns => $colspec,
221				_row_names => $self->row_names->dice( $which ) );
222				$select_df;
223				}
224
225				sub _column_helper {
226				my ($self) = @_;
227				Data::Frame::Column::Helper->new( dataframe => $self );
228				}
229
230				sub equal {
231				my ($self, $other, $d) = @_;
232				if( blessed($self) && $self->isa('Data::Frame') && blessed($other) && $other->isa('Data::Frame') ) {
233				if( $self->number_of_columns == $other->number_of_columns ) {
234				my @eq_cols = map { $self->nth_column($_) == $other->nth_column($_) }
235				0..$self->number_of_columns-1;
236				my @colnames = @{ $self->columns };
237				my @colspec = List::AllUtils::mesh( @colnames, @eq_cols );
238				return $self->new( columns => \@colspec );
239				} else {
240				die "number of columns is not equal: @{[$self->number_of_columns]} != @{[$other->number_of_columns]}";
241				}
242				}
243				}
244
245				1;
246
247				__END__
248
249				=pod
250
251				=encoding UTF-8
252
253				=head1 NAME
254
255				Data::Frame - data frame implementation
256
257				=head1 VERSION
258
259				version 0.003
260
261				=head1 SYNOPSIS
262
263				use Data::Frame;
264				use PDL;
265
266				my $df = Data::Frame->new( columns => [
267				z => pdl(1, 2, 3, 4),
268				y => ( sequence(4) >= 2 ) ,
269				x => [ qw/foo bar baz quux/ ],
270				] );
271
272				say $df;
273				# ---------------
274				# z y x
275				# ---------------
276				# 0 1 0 foo
277				# 1 2 0 bar
278				# 2 3 1 baz
279				# 3 4 1 quux
280				# ---------------
281
282				say $df->nth_column(0);
283				# [1 2 3 4]
284
285				say $df->select_rows( 3,1 )
286				# ---------------
287				# z y x
288				# ---------------
289				# 3 4 1 quux
290				# 1 2 0 bar
291				# ---------------
292
293				=head1 DESCRIPTION
294
295				This implements a data frame container that uses L<PDL> for individual columns.
296				As such, it supports marking missing values (C<BAD> values).
297
298				The API is currently experimental and is made to work with
299				L<Statistics::NiceR>, so be aware that it could change.
300
301				=head1 METHODS
302
303				=head2 new
304
305				new( Hash %options ) # returns Data::Frame
306
307				Creates a new C<Data::Frame> when passed the following options as a
308				specification of the columns to add:
309
310				=over 4
311
312				=item * columns => ArrayRef $columns_array
313
314				When C<columns> is passed an C<ArrayRef> of pairs of the form
315
316				$columns_array = [
317				column_name_z => $column_01_data, # first column data
318				column_name_y => $column_02_data, # second column data
319				column_name_x => $column_03_data, # third column data
320				]
321
322				then the column data is added to the data frame in the order that the pairs
323				appear in the C<ArrayRef>.
324
325				=item * columns => HashRef $columns_hash
326
327				$columns_hash = {
328				column_name_z => $column_03_data, # third column data
329				column_name_y => $column_02_data, # second column data
330				column_name_x => $column_01_data, # first column data
331				}
332
333				then the column data is added to the data frame by the order of the keys in the
334				C<HashRef> (sorted with a stringwise C<cmp>).
335
336				=back
337
338				=head2 string
339
340				string() # returns Str
341
342				Returns a string representation of the C<Data::Frame>.
343
344				=head2 number_of_columns
345
346				number_of_columns() # returns Int
347
348				Returns the count of the number of columns in the C<Data::Frame>.
349
350				=head2 number_of_rows
351
352				number_of_rows() # returns Int
353
354				Returns the count of the number of rows in the C<Data::Frame>.
355
356				=head2 nth_columm
357
358				number_of_rows(Int $n) # returns a column
359
360				Returns column number C<$n>. Supports negative indices (e.g., $n = -1 returns
361				the last column).
362
363				=head2 column_names
364
365				column_names() # returns an ArrayRef
366
367				column_names( @new_column_names ) # returns an ArrayRef
368
369				Returns an C<ArrayRef> of the names of the columns.
370
371				If passed a list of arguments C<@new_column_names>, then the columns will be
372				renamed to the elements of C<@new_column_names>. The length of the argument
373				must match the number of columns in the C<Data::Frame>.
374
375				=head2 row_names
376
377				row_names() # returns a PDL
378
379				row_names( Array @new_row_names ) # returns a PDL
380
381				row_names( ArrayRef $new_row_names ) # returns a PDL
382
383				row_names( PDL $new_row_names ) # returns a PDL
384
385				Returns an C<ArrayRef> of the names of the columns.
386
387				If passed a argument, then the rows will be renamed. The length of the argument
388				must match the number of rows in the C<Data::Frame>.
389
390				=head2 column
391
392				column( Str $column_name )
393
394				Returns the column with the name C<$column_name>.
395
396				=head2 add_columns
397
398				add_columns( Array @column_pairlist )
399
400				Adds all the columns in C<@column_pairlist> to the C<Data::Frame>.
401
402				=head2 add_column
403
404				add_column(Str $name, $data)
405
406				Adds a single column to the C<Data::Frame> with the name C<$name> and data
407				C<$data>.
408
409				=head2 select_rows
410
411				select_rows( Array @which )
412
413				select_rows( ArrayRef $which )
414
415				select_rows( PDL $which )
416
417				The argument C<$which> is a vector of indices. C<select_rows> returns a new
418				C<Data::Frame> that contains rows that match the indices in the vector
419				C<$which>.
420
421				This C<Data::Frame> supports PDL's data flow, meaning that changes to the
422				values in the child data frame columns will appear in the parent data frame.
423
424				If no indices are given, a C<Data::Frame> with no rows is returned.
425
426				=head1 SEE ALSO
427
428				=over 4
429
430				=item * L<R manual: data.frame\|https://stat.ethz.ch/R-manual/R-devel/library/base/html/data.frame.html>.
431
432				=item * L<Statistics::NiceR>
433
434				=item * L<PDL>
435
436				=back
437
438				=head1 AUTHOR
439
440				Zakariyya Mughal <zmughal@cpan.org>
441
442				=head1 COPYRIGHT AND LICENSE
443
444				This software is copyright (c) 2014 by Zakariyya Mughal.
445
446				This is free software; you can redistribute it and/or modify it under
447				the same terms as the Perl 5 programming language system itself.
448
449				=cut