| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance |
|
3
|
|
|
|
|
|
|
require 5.010; |
|
4
|
13
|
|
|
13
|
|
458814
|
use strict; |
|
|
13
|
|
|
|
|
17
|
|
|
|
13
|
|
|
|
|
438
|
|
|
5
|
13
|
|
|
13
|
|
80
|
use feature 'say'; |
|
|
13
|
|
|
|
|
15
|
|
|
|
13
|
|
|
|
|
3171
|
|
|
6
|
|
|
|
|
|
|
package Stats::LikeR; |
|
7
|
|
|
|
|
|
|
our $VERSION = 0.14; |
|
8
|
|
|
|
|
|
|
require XSLoader; |
|
9
|
13
|
|
|
13
|
|
6411
|
use Devel::Confess 'color'; |
|
|
13
|
|
|
|
|
111176
|
|
|
|
13
|
|
|
|
|
38
|
|
|
10
|
13
|
|
|
13
|
|
1199
|
use warnings FATAL => 'all'; |
|
|
13
|
|
|
|
|
33
|
|
|
|
13
|
|
|
|
|
592
|
|
|
11
|
13
|
|
|
13
|
|
5129
|
use autodie ':default'; |
|
|
13
|
|
|
|
|
171192
|
|
|
|
13
|
|
|
|
|
42
|
|
|
12
|
13
|
|
|
13
|
|
57231
|
use Exporter 'import'; |
|
|
13
|
|
|
|
|
17
|
|
|
|
13
|
|
|
|
|
459
|
|
|
13
|
13
|
|
|
13
|
|
48
|
use Scalar::Util 'looks_like_number'; |
|
|
13
|
|
|
|
|
17
|
|
|
|
13
|
|
|
|
|
7171
|
|
|
14
|
|
|
|
|
|
|
XSLoader::load('Stats::LikeR', $VERSION); |
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw(add_data aov cfilter chisq_test col col2col cor cor_test cov dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test wilcox_test write_table); |
|
16
|
|
|
|
|
|
|
our @EXPORT = @EXPORT_OK; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
require XSLoader; |
|
19
|
|
|
|
|
|
|
# ---- filter DSL: col() builds a predicate via overloading (pure Perl) ------- |
|
20
|
|
|
|
|
|
|
# Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm; |
|
21
|
|
|
|
|
|
|
# they need no XS. filter() is the XSUB. |
|
22
|
26
|
|
|
26
|
1
|
135822
|
# col($name)
#   Entry point of the pure-Perl filter DSL: wraps a column name in a
#   Stats::LikeR::Col object and returns it.
sub col {
    my ($name) = @_;
    return Stats::LikeR::Col->new($name);
}
|
23
|
|
|
|
|
|
|
{
    package Stats::LikeR::Col;

    # Constructor: works as a class method or on an existing instance and
    # stores the column name under the 'name' key.
    sub new {
        my ($proto, $name) = @_;
        my $class = ref($proto) || $proto;
        return bless { name => $name }, $class;
    }

    # Build a comparison leaf. When Perl reports that the operands were
    # swapped (e.g. 4 > col('x')), the mirrored operator is used instead so
    # the column is always treated as the left-hand side.
    sub _c {
        my ($self, $val, $swapped, $op, $flip) = @_;
        my $effective_op = $swapped ? $flip : $op;
        return Stats::LikeR::Pred->_leaf($self->{name}, $effective_op, $val);
    }

    # Each overloaded comparison operator mapped to the operator that gives
    # the same result once its operands are exchanged. Filled in a BEGIN
    # block because "use overload" below runs at compile time.
    my %mirror_of;
    BEGIN {
        %mirror_of = (
            '>'  => '<',
            '<'  => '>',
            '>=' => '<=',
            '<=' => '>=',
            '==' => '==',
            '!=' => '!=',
            'lt' => 'gt',
            'gt' => 'lt',
            'le' => 'ge',
            'ge' => 'le',
            'eq' => 'eq',
            'ne' => 'ne',
        );
    }

    use overload
        fallback => 1,
        map {
            my ($op, $flip) = ($_, $mirror_of{$_});
            ($op => sub { $_[0]->_c($_[1], $_[2], $op, $flip) });
        } sort keys %mirror_of;
}
|
46
|
|
|
|
|
|
|
{
    package Stats::LikeR::Pred;

    # Leaf predicate: a single "column OP value" comparison.
    # Called as a class method; the invocant is ignored.
    sub _leaf {
        my (undef, $column, $op, $value) = @_;
        return bless { op => $op, col => $column, val => $value }, __PACKAGE__;
    }

    # Inner node: combines predicates under 'and', 'or' or 'not'.
    # Called as a plain function (not a method).
    sub _node {
        my ($op, $left, $right) = @_;
        return bless { op => $op, l => $left, r => $right }, __PACKAGE__;
    }

    # &, | and ! combine predicates into a tree instead of evaluating them.
    use overload
        '&' => sub {
            my ($lhs, $rhs) = @_;
            return _node('and', $lhs, $rhs);
        },
        '|' => sub {
            my ($lhs, $rhs) = @_;
            return _node('or', $lhs, $rhs);
        },
        '!' => sub {
            my ($operand) = @_;
            return _node('not', $operand, undef);
        },
        fallback => 1;
}
|
56
|
|
|
|
|
|
|
# summary(DATA [, nrows => N])
#
# Print (to STDOUT) and return (as an array ref of lines) a table with the
# count of numeric values, minimum, 1st quartile, median, mean, 3rd quartile
# and maximum of DATA.
#
# DATA may be:
#   * a flat list or a reference to a flat array  -> one row of statistics
#   * a reference to an array of array refs       -> one row per index
#   * a reference to a hash of array refs         -> one row per key, keys
#                                                    sorted case-insensitively
# Options:
#   nrows (alias nrow) - maximum number of rows shown for the array-of-arrays
#                        and hash forms; defaults to 10. nrows => 0 shows the
#                        header only.
#
# Dies if DATA is a reference other than ARRAY/HASH, or if any value is
# undefined (the offending indices are printed to STDERR first). Values that
# do not look like numbers are silently left out of the statistics.
sub summary {
    my ($data, %args);
    my $current_sub = (split(/::/, (caller(0))[3]))[-1];

    if (@_ && ref $_[0]) {
        # summary(\@arr), summary(\@arr, nrows => 5), summary(\%h, nrow => 3)
        $data = shift;
        %args = @_;    # any trailing key/value pairs
    } else {
        # summary(@values) or summary(@values, nrows => 2): peel known named
        # arguments off the end of the flat list; the rest is the data.
        while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {
            my $val = pop @_;
            my $key = pop @_;
            $args{$key} = $val;
        }
        $data = [@_];
    }
    # Normalize nrow -> nrows, default to 10
    $args{nrows} //= delete($args{nrow}) // 10;

    my $ref_type = ref $data;
    if ($ref_type ne 'ARRAY' && $ref_type ne 'HASH') {
        die "$current_sub: data must either be a hash or an array, not \"$ref_type\"";
    }

    my @header     = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');
    my $n_cols     = scalar @header;
    my $header_row = sprintf('%9s ' x $n_cols, @header);
    my $row_format = '%9.4g ' x $n_cols;
    my @out;

    if ($ref_type eq 'ARRAY' && ref($data->[0]) eq '') {
        # Flat array of values: a single row.
        push @out, '-' x 75, $header_row, '-' x 75;
        push @out, sprintf($row_format, _summary_stats($data, '', $current_sub));
    } elsif ($ref_type eq 'ARRAY') {
        # Array of arrays: one row per index.
        push @out, '-' x 75, 'Index ' . $header_row, '-' x 75;
        my $rows_printed = 0;
        foreach my $index (0 .. $#$data) {
            # Checked before emitting so that nrows => 0 prints no rows.
            last if $rows_printed >= $args{nrows};
            my @stats = _summary_stats($data->[$index], " for index $index", $current_sub);
            # %6d (not %6.4g) so large indices are not rendered as 1e+04 etc.
            push @out, sprintf('%6d', $index) . sprintf($row_format, @stats);
            $rows_printed++;
        }
    } else {
        # Hash of arrays: one row per key.
        push @out, '-' x 78, ' Key ' . $header_row, '-' x 78;
        my $rows_printed = 0;
        foreach my $key (sort { lc $a cmp lc $b } keys %{$data}) {
            last if $rows_printed >= $args{nrows};
            my @stats = _summary_stats($data->{$key}, " for key $key", $current_sub);
            # Truncate long keys to 9 characters and pad short ones so the
            # columns line up.
            my $print_key = substr($key, 0, 9);
            if (length($print_key) < 9) {
                $print_key .= ' ' x (9 - length $print_key);
            }
            push @out, $print_key . sprintf($row_format, @stats);
            $rows_printed++;
        }
    }
    say join("\n", @out);
    return \@out;
}

# _summary_stats(\@values, $where, $caller_name)
#   Private helper for summary(). Dies (after listing the offending indices on
#   STDERR) if any element of @values is undefined; $where is spliced into the
#   error message to identify the row. Otherwise returns the seven figures for
#   one table row, computed over the elements that look like numbers:
#   (count, min, 25% quantile, median, mean, 75% quantile, max).
sub _summary_stats {
    my ($values, $where, $caller_name) = @_;
    my @undef = grep { !defined $values->[$_] } 0 .. $#$values;
    if (@undef) {
        say STDERR join(',', @undef);
        die "The above indices are not defined$where in $caller_name";
    }
    my @numeric = grep { looks_like_number($_) } @{$values};
    my $q = quantile(\@numeric, probs => [0.25, 0.75]);
    return (
        scalar @numeric,
        min(\@numeric),
        $q->{'25%'},
        median(\@numeric),
        mean(\@numeric),
        $q->{'75%'},
        max(\@numeric),
    );
}
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
# read_table($file, %options)
#
# Read a delimited text file whose first row is a header and return its
# contents as a data structure.
#
# Options:
#   sep (alias delim) - field separator; defaults to a tab for *.tsv files and
#                       a comma otherwise.
#   comment           - comment prefix (default '#'); handed to the parser and
#                       also stripped from the start of the header line.
#   output.type       - 'aoh' (default; array of row hashes), 'hoa' (hash of
#                       column arrays) or 'hoh' (hash of row hashes keyed by
#                       the row-name column).
#   row.names         - name of the column holding row names; for 'hoh' it
#                       defaults to the first column.
#   filter            - CODE ref (applied to the whole row) or HASH ref mapping
#                       a column name or a field number (1-based, 0 = whole
#                       row) to a CODE ref. Each callback is called with
#                       ($row_array_ref, $row_hash_ref), with $_ set to the
#                       field (or the row array ref for field 0) and %_ set
#                       to a copy of the row. A false return drops the row.
#                       Changes made to $_ are written back to the field.
#
# Empty fields are stored as undef. An empty first header cell is renamed
# 'row_name'. Duplicate column or row names produce a warning (later values
# win).
#
# Returns an ARRAY ref for 'aoh', a HASH ref for 'hoa' and 'hoh'.
# Dies on a missing/unreadable file, unknown options, an invalid output.type
# or filter, an unknown row.names/filter column, a row whose field count
# differs from the header, or (hoh only) an empty row name.
sub read_table {
    my $file = shift;
    die "read_table: no file name given" unless defined $file;
    die "\"$file\" is not a file" unless -f $file;
    die "\"$file\" is not readable" unless -r $file;

    my %input_args = @_;
    # 'delim' is accepted as a synonym for 'sep'
    if (defined $input_args{delim}) {
        $input_args{sep} = delete $input_args{delim};
    }

    my %args = (
        sep     => ($file =~ /\.tsv$/i ? "\t" : ','),
        comment => '#',
        %input_args,
    );

    my %allowed_args = map { $_ => 1 } (
        'comment', 'output.type', 'filter', 'row.names', 'sep', 'delim'
    );
    my @undef_args = sort grep { !$allowed_args{$_} } keys %args;
    if (@undef_args) {
        my $current_sub = (split(/::/, (caller(0))[3]))[-1];    # only needed on the error path
        say STDERR join(', ', @undef_args);    # diagnostics belong on STDERR
        die "the above args aren't defined for $current_sub";
    }

    # An explicit undef means "nothing"; normalise once so every later use
    # (parser call and header comment stripping) sees a defined string.
    $args{sep}           //= '';
    $args{comment}       //= '';
    $args{'output.type'} //= 'aoh';
    if ($args{'output.type'} !~ m/^(?:aoh|hoa|hoh)$/) {
        die "\"$args{'output.type'}\" isn't allowed";
    }

    my $filter = $args{filter};
    if (defined $filter && ref($filter) eq 'CODE') {
        $filter = { 0 => $filter };    # a bare CODE ref filters on the whole row
    } elsif (defined $filter && ref($filter) ne 'HASH') {
        die "'filter' must be a CODE or HASH reference";
    }

    my (@data, %data, @header, %mapped_filters, @sorted_filter_flds, %seen_rownames);
    _parse_csv_file($file, $args{sep}, $args{comment}, sub {
        my ($line_ref) = @_;
        if (!@header) {
            # --- HEADER PROCESSING (copy made only here; runs once) ---
            my @line = @$line_ref;
            $line[0] =~ s/^\Q$args{comment}\E// if @line && defined $line[0];
            # The header is treated like a data row (no trailing-empty
            # stripping), so a consistent trailing separator does not trigger
            # a false "Alignment error"; genuinely ragged rows still do.
            @header = @line;
            if (@header && (!defined $header[0] || $header[0] eq '')) {
                $header[0] = 'row_name';
            }
            my %seen_h;
            my @dup_cols = grep { $seen_h{$_}++ } @header;
            warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n" if @dup_cols;
            if ($args{'output.type'} eq 'hoh' && !defined $args{'row.names'}) {
                $args{'row.names'} = $header[0];
            }
            if (defined $args{'row.names'} && !grep { $_ eq $args{'row.names'} } @header) {
                die "\"$args{'row.names'}\" isn't in the header of $file";
            }
            if ($filter) {
                for my $k (keys %$filter) {
                    # A numeric key is a field number only while it is within
                    # the header; otherwise it is looked up as a column name
                    # (e.g. a column called "2020"), which dies if not found.
                    if ($k =~ /^\d+$/ && $k <= @header) {
                        $mapped_filters{$k} = $filter->{$k};
                    } else {
                        my ($idx) = grep { $header[$_] eq $k } 0 .. $#header;
                        die "Filter column '$k' not found in header" unless defined $idx;
                        $mapped_filters{$idx + 1} = $filter->{$k};
                    }
                }
                @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters;    # constant per file
            }
            return;
        }

        # --- DATA PROCESSING (operate on $line_ref directly; no per-row array copy) ---
        if (scalar @$line_ref != scalar @header) {
            die "Alignment error on $file (" . scalar(@$line_ref) . " fields vs " . scalar(@header) . " headers).";
        }
        my %line_hash;
        for my $i (0 .. $#header) {
            my $v = $line_ref->[$i];
            $line_hash{$header[$i]} = (!defined($v) || $v eq '') ? undef : $v;
        }

        # --- APPLY FILTERS ---
        if (@sorted_filter_flds) {
            local %_ = %line_hash;    # row available as %_; set once per row, not per field
            my $skip = 0;
            foreach my $fld (@sorted_filter_flds) {
                local $_ = $fld == 0 ? $line_ref : $line_ref->[$fld - 1];
                if (!$mapped_filters{$fld}->($line_ref, \%line_hash)) {
                    $skip = 1;
                    last;
                }
                if ($fld > 0) {    # write back any mutation the callback made to $_
                    $line_ref->[$fld - 1] = $_;
                    $line_hash{$header[$fld - 1]} = (!defined($_) || $_ eq '') ? undef : $_;
                }
            }
            return if $skip;
        }

        # --- Populate the requested data structure ---
        if ($args{'output.type'} eq 'aoh') {
            push @data, \%line_hash;
        } elsif ($args{'output.type'} eq 'hoa') {
            push @{ $data{$_} }, $line_hash{$_} for @header;
        } elsif ($args{'output.type'} eq 'hoh') {
            my $row_name = $line_hash{ $args{'row.names'} };
            # An empty row name cannot be used as a hash key; fail with a
            # clear message instead of an "uninitialized value" warning.
            die "read_table: empty row name in column '$args{'row.names'}' of $file"
                unless defined $row_name;
            warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"
                if $seen_rownames{$row_name}++;
            foreach my $col (@header) {
                next if $col eq $args{'row.names'};
                $data{$row_name}{$col} = $line_hash{$col};
            }
        }
    });

    return $args{'output.type'} eq 'aoh' ? \@data : \%data;    # hoa and hoh share %data
}
|
276
|
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
1; |
|
278
|
|
|
|
|
|
|
=encoding utf8 |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head1 Synopsis |
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
Get basic statistical functions working in Perl as if they were part of List::Util, like C<min>, C<max>, C<sum>, etc. |
|
283
|
|
|
|
|
|
|
I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter. |
|
284
|
|
|
|
|
|
|
There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly. |
|
285
|
|
|
|
|
|
|
This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability. |
|
286
|
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
There B<are> other modules on CPAN that can do B<some> of this, but this works the way that I B<want> it to. |
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=head1 Functions/Subroutines |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
======================================================================== |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=head2 add_data |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference. |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
=head3 Hash of Hashes (HoH) |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows. |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
$data = { 'Jack Smith' => { age => 30 } }; |
|
302
|
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
$n = { |
|
304
|
|
|
|
|
|
|
'Jack Smith' => { # Update existing (Hash) |
|
305
|
|
|
|
|
|
|
dept => 'Engineering' |
|
306
|
|
|
|
|
|
|
}, |
|
307
|
|
|
|
|
|
|
'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash) |
|
308
|
|
|
|
|
|
|
'Invalid' => 'Not a reference' # Edge case safety |
|
309
|
|
|
|
|
|
|
}; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
add_data($data, $n); |
|
312
|
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
B |
|
314
|
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
{ |
|
316
|
|
|
|
|
|
|
"Jack Smith": { |
|
317
|
|
|
|
|
|
|
"age": 30, |
|
318
|
|
|
|
|
|
|
"dept": "Engineering" |
|
319
|
|
|
|
|
|
|
}, |
|
320
|
|
|
|
|
|
|
"Jane Doe": { |
|
321
|
|
|
|
|
|
|
"age": 25, |
|
322
|
|
|
|
|
|
|
"dept": "Sales" |
|
323
|
|
|
|
|
|
|
} |
|
324
|
|
|
|
|
|
|
} |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head3 Hash of Arrays (HoA) |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C. |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
$data = { 'Project Alpha' => [ 'task1', 'task2' ] }; |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
$n = { |
|
333
|
|
|
|
|
|
|
'Project Alpha' => [ 'task3' ], # Appends to existing array |
|
334
|
|
|
|
|
|
|
'Project Beta' => [ 'task1', 'task2' ] # Creates new array row |
|
335
|
|
|
|
|
|
|
}; |
|
336
|
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
add_data($data, $n); |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
B |
|
340
|
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
{ |
|
342
|
|
|
|
|
|
|
"Project Alpha": [ "task1", "task2", "task3" ], |
|
343
|
|
|
|
|
|
|
"Project Beta": [ "task1", "task2" ] |
|
344
|
|
|
|
|
|
|
} |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=head3 Array of Hashes / Arrays (AoH / AoA) |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
C<add_data> now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices. |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
$data = [ |
|
351
|
|
|
|
|
|
|
{ id => 1, name => 'Alice' } |
|
352
|
|
|
|
|
|
|
]; |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
$n = [ |
|
355
|
|
|
|
|
|
|
{ role => 'Admin' }, # Updates index 0 |
|
356
|
|
|
|
|
|
|
{ id => 2, name => 'Bob' } # Creates index 1 |
|
357
|
|
|
|
|
|
|
]; |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
add_data($data, $n); |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
B |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
[ |
|
364
|
|
|
|
|
|
|
{ "id": 1, "name": "Alice", "role": "Admin" }, |
|
365
|
|
|
|
|
|
|
{ "id": 2, "name": "Bob" } |
|
366
|
|
|
|
|
|
|
] |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=head3 Advanced Structural Coercion & Cross-Merging |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
C<add_data> strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target. |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
B<1. Inner Coercion (Mixing Rows):> |
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
=over |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=item * B Source Array rows are read in pairs and converted to key-value pairs. |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=item * B Source Hash rows are flattened into key-value pairs and pushed onto the array. |
|
379
|
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
=back |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
B<2. Root-Level Coercion (Mixing Outer Containers):> |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=over |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored. |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">). |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=back |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
=head3 Source is a mixed Hash. Keys dictate the target array index! |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
$n = { |
|
395
|
|
|
|
|
|
|
'0' => { y => 20 }, # Merges into $data->[0] |
|
396
|
|
|
|
|
|
|
'1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1] |
|
397
|
|
|
|
|
|
|
'ignored' => { k => 'v' } # Ignored: cannot map to an array index |
|
398
|
|
|
|
|
|
|
}; |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
add_data($data, $n); |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
B |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
[ |
|
405
|
|
|
|
|
|
|
{ "x": 10, "y": 20 }, |
|
406
|
|
|
|
|
|
|
{ "z": 30 } |
|
407
|
|
|
|
|
|
|
] |
|
408
|
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
NB: If C<add_data> is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data. |
|
410
|
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
=head2 aov |
|
412
|
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
Warning: assumes normal distribution |
|
414
|
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
aov( |
|
416
|
|
|
|
|
|
|
{ |
|
417
|
|
|
|
|
|
|
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
418
|
|
|
|
|
|
|
ctrl => [1, 1, 1, 0, 0, 0] |
|
419
|
|
|
|
|
|
|
}, |
|
420
|
|
|
|
|
|
|
'yield ~ ctrl'); |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
which returns |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
{ |
|
425
|
|
|
|
|
|
|
ctrl { |
|
426
|
|
|
|
|
|
|
Df 1, |
|
427
|
|
|
|
|
|
|
"F value" 25.6000000000001, |
|
428
|
|
|
|
|
|
|
"Mean Sq" 1.70666666666667, |
|
429
|
|
|
|
|
|
|
Pr(>F) 0.00718232855871859, |
|
430
|
|
|
|
|
|
|
"Sum Sq" 1.70666666666667 |
|
431
|
|
|
|
|
|
|
}, |
|
432
|
|
|
|
|
|
|
Residuals { |
|
433
|
|
|
|
|
|
|
Df 4, |
|
434
|
|
|
|
|
|
|
"Mean Sq" 0.0666666666666665, |
|
435
|
|
|
|
|
|
|
"Sum Sq" 0.266666666666666 |
|
436
|
|
|
|
|
|
|
} |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction: |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
my $res_2way = aov($data_2way, 'len ~ supp * dose'); |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior. |
|
444
|
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=head3 Input Parameters |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=begin html |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
| Parameter |
|
455
|
|
|
|
|
|
|
| Type |
|
456
|
|
|
|
|
|
|
| Default |
|
457
|
|
|
|
|
|
|
| Description |
|
458
|
|
|
|
|
|
|
| Example |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
| |
|
461
|
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
| data_sv |
|
464
|
|
|
|
|
|
|
| HashRef or ArrayRef |
|
465
|
|
|
|
|
|
|
| (Required) |
|
466
|
|
|
|
|
|
|
| The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's stack() on a named list). |
|
467
|
|
|
|
|
|
|
| |
|
468
|
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
| formula_sv |
|
471
|
|
|
|
|
|
|
| String |
|
472
|
|
|
|
|
|
|
| undef |
|
473
|
|
|
|
|
|
|
| A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to 'Value ~ Group' and the input data is stacked. |
|
474
|
|
|
|
|
|
|
| 'yield ~ N * P' |
|
475
|
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
| |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
=end html |
|
480
|
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
=head3 Output Variables |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
The function returns a single C<HashRef> containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs. |
|
486
|
|
|
|
|
|
|
|
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
=begin html |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
| Parameter |
|
495
|
|
|
|
|
|
|
| Type |
|
496
|
|
|
|
|
|
|
| Default |
|
497
|
|
|
|
|
|
|
| Description |
|
498
|
|
|
|
|
|
|
| Example |
|
499
|
|
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
| |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
| (Term Name) |
|
504
|
|
|
|
|
|
|
| HashRef |
|
505
|
|
|
|
|
|
|
| undef |
|
506
|
|
|
|
|
|
|
| A nested hash for each independent term in the formula (e.g., 'Group', 'N:P'), containing its ANOVA table statistics. |
|
507
|
|
|
|
|
|
|
| {'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004} |
|
508
|
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
| Residuals |
|
511
|
|
|
|
|
|
|
| HashRef |
|
512
|
|
|
|
|
|
|
| undef |
|
513
|
|
|
|
|
|
|
| A nested hash containing the residual (error) statistics for the fitted model. |
|
514
|
|
|
|
|
|
|
| {'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
| group_stats |
|
518
|
|
|
|
|
|
|
| HashRef |
|
519
|
|
|
|
|
|
|
| undef |
|
520
|
|
|
|
|
|
|
| A nested hash containing descriptive statistics (mean and size / count) for every column evaluated in the original unstacked data structure. |
|
521
|
|
|
|
|
|
|
| {'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}} |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
| |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
=end html |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
=head3 omitting formula |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
As of version 0.07, in the case of an omitted formula, stacking is done: |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
aov( |
|
535
|
|
|
|
|
|
|
{ |
|
536
|
|
|
|
|
|
|
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
537
|
|
|
|
|
|
|
ctrl => [1, 1, 1, 0, 0, 0] |
|
538
|
|
|
|
|
|
|
}, |
|
539
|
|
|
|
|
|
|
); |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
is the equivalent of: |
|
542
|
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2) |
|
544
|
|
|
|
|
|
|
ctrl <- c(1, 1, 1, 0, 0, 0) |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
# Combine them into a named list (the R equivalent of your hash) |
|
547
|
|
|
|
|
|
|
my_list <- list(yield = yield, ctrl = ctrl) |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
# Convert the list into a "long" dataframe |
|
550
|
|
|
|
|
|
|
# This creates two columns: "values" and "ind" (the group name) |
|
551
|
|
|
|
|
|
|
my_data <- stack(my_list) |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# Rename columns for clarity (optional but good practice) |
|
554
|
|
|
|
|
|
|
colnames(my_data) <- c("Value", "Group") |
|
555
|
|
|
|
|
|
|
anova_model <- aov(Value ~ Group, data = my_data) |
|
556
|
|
|
|
|
|
|
summary(anova_model) |
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
in R |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=head2 cfilter |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Select B<columns> out of a table and return it in the same shape. A column is |
|
563
|
|
|
|
|
|
|
the inner (second-level) key of a B<hash of hashes> or an B<array of hashes>, |
|
564
|
|
|
|
|
|
|
or the outer key of a B<hash of arrays>: |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
use Stats::LikeR; |
|
567
|
|
|
|
|
|
|
my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] ); |
|
568
|
|
|
|
|
|
|
cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] } |
|
569
|
|
|
|
|
|
|
cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] } |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
C<cfilter> takes exactly one of C<keep> or C<remove>. C<keep> returns only the |
|
572
|
|
|
|
|
|
|
matching columns; C<remove> returns everything except them. The result is the |
|
573
|
|
|
|
|
|
|
same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values |
|
574
|
|
|
|
|
|
|
copied and the original structure left untouched. |
|
575
|
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
=head3 Selecting by name |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
Pass an array ref of column names. Naming a column that is not present in the |
|
579
|
|
|
|
|
|
|
data is an error (it catches typos), and a row that happens not to contain a |
|
580
|
|
|
|
|
|
|
kept column simply comes back without it: |
|
581
|
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
my @aoh = ( { a => 1, b => 2 }, { a => 3 } ); |
|
583
|
|
|
|
|
|
|
cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ] |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
=head3 Selecting by a predicate |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
Instead of names, C<keep>/C<remove> accept a B<predicate> — a CODE ref or a |
|
588
|
|
|
|
|
|
|
function name — evaluated once per column. It is called as |
|
589
|
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
$predicate->($column_values, $column_name) |
|
591
|
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
where C<$column_values> is an array ref of the column's B<defined> cells (undef |
|
593
|
|
|
|
|
|
|
and missing cells are dropped, so functions like C<sd> get clean input). |
|
594
|
|
|
|
|
|
|
With C<keep>, columns for which the predicate is true are kept; with C<remove>, |
|
595
|
|
|
|
|
|
|
those columns are dropped. |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
# Keep only the constant columns (standard deviation zero): |
|
598
|
|
|
|
|
|
|
my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] } |
|
599
|
|
|
|
|
|
|
# Drop the constant columns instead: |
|
600
|
|
|
|
|
|
|
my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... } |
|
601
|
|
|
|
|
|
|
# A bare function name resolves in Stats::LikeR:: (use a package for your own): |
|
602
|
|
|
|
|
|
|
cfilter(\%hoa, keep => 'some_predicate'); |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
A bare string is always treated as a B<function name>, not a single column |
|
605
|
|
|
|
|
|
|
name, so to keep one column by name use an array ref: C<< keep =E<gt> ['x'] >>. |
|
606
|
|
|
|
|
|
|
|
|
607
|
|
|
|
|
|
|
=head3 Errors |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
C<cfilter> dies (via C<croak>) when: |
|
610
|
|
|
|
|
|
|
|
|
611
|
|
|
|
|
|
|
=over |
|
612
|
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=item * neither C<keep> nor C<remove> is given, or both are, |
|
614
|
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=item * a named column is not present in the data, |
|
616
|
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
=item * the selector is neither an array ref nor a code ref / function name, or the |
|
618
|
|
|
|
|
|
|
function name cannot be resolved, |
|
619
|
|
|
|
|
|
|
|
|
620
|
|
|
|
|
|
|
=item * an unknown option is given, or the options are not C<< name =E<gt> value >> pairs, |
|
621
|
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
=item * the data is not a hash/array reference of the expected shape (a hash of hash |
|
623
|
|
|
|
|
|
|
refs or array refs, or an array of hash refs). |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
=back |
|
626
|
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
=head2 chisq_test |
|
628
|
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
The C<chisq_test> function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C<chisq.test>, returning a structured hash reference of the results. |
|
630
|
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
For 2x2 matrices, Yates' Continuity Correction is applied automatically. |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
=head3 Accepted Inputs |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
|
|
636
|
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
=begin html |
|
638
|
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
| Input Type |
|
643
|
|
|
|
|
|
|
| Data Structure |
|
644
|
|
|
|
|
|
|
| Applied Test |
|
645
|
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
| |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
| 1D Array |
|
650
|
|
|
|
|
|
|
| [ $v1, $v2, ... ] |
|
651
|
|
|
|
|
|
|
| Chi-squared test for given probabilities |
|
652
|
|
|
|
|
|
|
|
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
| 2D Array |
|
655
|
|
|
|
|
|
|
| [ [ $v1, $v2 ], [ $v3, $v4 ] ] |
|
656
|
|
|
|
|
|
|
| Pearson's Chi-squared test (Yates' correction if 2x2) |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
|
|
659
|
|
|
|
|
|
|
| 1D Hash |
|
660
|
|
|
|
|
|
|
| { key1 => $v1, key2 => $v2 } |
|
661
|
|
|
|
|
|
|
| Chi-squared test for given probabilities |
|
662
|
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
| 2D Hash |
|
665
|
|
|
|
|
|
|
| { row1 => { c1 => $v1, c2 => $v2 } } |
|
666
|
|
|
|
|
|
|
| Pearson's Chi-squared test (Yates' correction if 2x2) |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
| |
|
670
|
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
=end html |
|
672
|
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
=head3 Output Object Structure |
|
676
|
|
|
|
|
|
|
|
|
677
|
|
|
|
|
|
|
The function returns a single hash reference containing the following key-value pairs. The internal structure of C<observed> and C<expected> will always match the structure of your input. |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
=begin html |
|
682
|
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=end html |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
=head3 Two-Dimensional Array |
|
735
|
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically. |
|
737
|
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
my $test_data = [ |
|
739
|
|
|
|
|
|
|
[762, 327, 468], |
|
740
|
|
|
|
|
|
|
[484, 239, 477] |
|
741
|
|
|
|
|
|
|
]; |
|
742
|
|
|
|
|
|
|
my $res = chisq_test($test_data); |
|
743
|
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
B |
|
745
|
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
{ |
|
747
|
|
|
|
|
|
|
'data.name' => 'Perl ArrayRef', |
|
748
|
|
|
|
|
|
|
'expected' => [ |
|
749
|
|
|
|
|
|
|
[ 703.671381936888, 319.645266594124, 533.683351468988 ], |
|
750
|
|
|
|
|
|
|
[ 542.328618063112, 246.354733405876, 411.316648531012 ] |
|
751
|
|
|
|
|
|
|
], |
|
752
|
|
|
|
|
|
|
'method' => "Pearson's Chi-squared test", |
|
753
|
|
|
|
|
|
|
'observed' => [ |
|
754
|
|
|
|
|
|
|
[ 762, 327, 468 ], |
|
755
|
|
|
|
|
|
|
[ 484, 239, 477 ] |
|
756
|
|
|
|
|
|
|
], |
|
757
|
|
|
|
|
|
|
'p.value' => 2.95358918321176e-07, |
|
758
|
|
|
|
|
|
|
'parameter' => { 'df' => 2 }, |
|
759
|
|
|
|
|
|
|
'statistic' => { 'X-squared' => 30.0701490957547 } |
|
760
|
|
|
|
|
|
|
} |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
=head3 1-Dimensional Array (Goodness of Fit) |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items. |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
my $data = [10, 20, 30]; |
|
767
|
|
|
|
|
|
|
my $res = chisq_test($data); |
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
B |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
{ |
|
772
|
|
|
|
|
|
|
'data.name' => 'Perl ArrayRef', |
|
773
|
|
|
|
|
|
|
'expected' => [ 20, 20, 20 ], |
|
774
|
|
|
|
|
|
|
'method' => 'Chi-squared test for given probabilities', |
|
775
|
|
|
|
|
|
|
'observed' => [ 10, 20, 30 ], |
|
776
|
|
|
|
|
|
|
'p.value' => 0.00673794699908547, |
|
777
|
|
|
|
|
|
|
'parameter' => { 'df' => 2 }, |
|
778
|
|
|
|
|
|
|
'statistic' => { 'X-squared' => 10 } |
|
779
|
|
|
|
|
|
|
} |
|
780
|
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
=head3 2-Dimensional Hash (Pearson's Chi-squared) |
|
782
|
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings. |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
my $data = { |
|
786
|
|
|
|
|
|
|
GroupA => { Success => 10, Failure => 15 }, |
|
787
|
|
|
|
|
|
|
GroupB => { Success => 20, Failure => 5 } |
|
788
|
|
|
|
|
|
|
}; |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
my $res = chisq_test($data); |
|
791
|
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
B |
|
793
|
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
{ |
|
795
|
|
|
|
|
|
|
'data.name' => 'Perl HashRef', |
|
796
|
|
|
|
|
|
|
'expected' => { |
|
797
|
|
|
|
|
|
|
'GroupA' => { 'Failure' => 10, 'Success' => 15 }, |
|
798
|
|
|
|
|
|
|
'GroupB' => { 'Failure' => 10, 'Success' => 15 } |
|
799
|
|
|
|
|
|
|
}, |
|
800
|
|
|
|
|
|
|
'method' => "Pearson's Chi-squared test with Yates' continuity correction", |
|
801
|
|
|
|
|
|
|
'observed' => { |
|
802
|
|
|
|
|
|
|
'GroupA' => { 'Failure' => 15, 'Success' => 10 }, |
|
803
|
|
|
|
|
|
|
'GroupB' => { 'Failure' => 5, 'Success' => 20 } |
|
804
|
|
|
|
|
|
|
}, |
|
805
|
|
|
|
|
|
|
'p.value' => 0.00937475878430379, |
|
806
|
|
|
|
|
|
|
'parameter' => { 'df' => 1 }, |
|
807
|
|
|
|
|
|
|
'statistic' => { 'X-squared' => 6.75 } |
|
808
|
|
|
|
|
|
|
} |
|
809
|
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
=head3 One-Dimensional Hash (Goodness of Fit) |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
Flat hash references evaluate Goodness of Fit while preserving your categorical keys in the C<observed> and C<expected> output blocks. |
|
813
|
|
|
|
|
|
|
|
|
814
|
|
|
|
|
|
|
my $data = { |
|
815
|
|
|
|
|
|
|
Apples => 10, |
|
816
|
|
|
|
|
|
|
Oranges => 20, |
|
817
|
|
|
|
|
|
|
Bananas => 30 |
|
818
|
|
|
|
|
|
|
}; |
|
819
|
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
my $res = chisq_test($data); |
|
821
|
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
=head1 C<col2col> |
|
823
|
|
|
|
|
|
|
|
|
824
|
|
|
|
|
|
|
Apply a B<function> to every pair of columns in a table and collect |
|
825
|
|
|
|
|
|
|
the answers in a hash of hashes. |
|
826
|
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
It's the workhorse behind things like correlation matrices: give it your data and |
|
828
|
|
|
|
|
|
|
the name of a function that takes two columns (C, C, …) and you get |
|
829
|
|
|
|
|
|
|
back every column compared against every other column. |
|
830
|
|
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
use Stats::LikeR; |
|
832
|
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
my %data = ( |
|
834
|
|
|
|
|
|
|
height => [ 170, 165, 180, 175 ], |
|
835
|
|
|
|
|
|
|
weight => [ 70, 60, 85, 77 ], |
|
836
|
|
|
|
|
|
|
age => [ 30, 41, 25, 38 ], |
|
837
|
|
|
|
|
|
|
); |
|
838
|
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
my $result = col2col(\%data, 'cor'); |
|
840
|
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
# $result->{height}{weight} == correlation of height vs weight |
|
842
|
|
|
|
|
|
|
# $result->{height}{age} == correlation of height vs age |
|
843
|
|
|
|
|
|
|
# ...and so on for every pair |
|
844
|
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
======================================================================== |
|
846
|
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
=head2 Arguments |
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
col2col( $data, $command, $cols, %options ) |
|
850
|
|
|
|
|
|
|
col2col( $data, $command, \%options ) # options in place of $cols |
|
851
|
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=begin html |
|
855
|
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
=end html |
|
889
|
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
|
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
======================================================================== |
|
893
|
|
|
|
|
|
|
|
|
894
|
|
|
|
|
|
|
=head2 Data shapes |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
C<col2col> understands three layouts. In every case a B<column> is the thing that |
|
897
|
|
|
|
|
|
|
gets compared, and the result is keyed by column name. |
|
898
|
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
B<Hash of arrays> — keys are column names: |
|
900
|
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] ); |
|
902
|
|
|
|
|
|
|
|
|
903
|
|
|
|
|
|
|
B<Hash of hashes> — first keys are row names, second keys are columns: |
|
904
|
|
|
|
|
|
|
|
|
905
|
|
|
|
|
|
|
my %hoh = ( |
|
906
|
|
|
|
|
|
|
row1 => { a => 1, b => 4 }, |
|
907
|
|
|
|
|
|
|
row2 => { a => 2, b => 5 }, |
|
908
|
|
|
|
|
|
|
); |
|
909
|
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
B<Array of hashes> — each element is a row, inner keys are columns: |
|
911
|
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } ); |
|
913
|
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
All three produce the same result for the same underlying numbers. Missing or |
|
915
|
|
|
|
|
|
|
C<undef> cells are handled by the C<na> option (below). |
|
916
|
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
======================================================================== |
|
918
|
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=head2 The command |
|
920
|
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
The second argument is the function applied to each pair of columns. It is called |
|
922
|
|
|
|
|
|
|
as: |
|
923
|
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
$command->( $column_a, $column_b ) # two ARRAY refs |
|
925
|
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
so inside a block the two columns arrive in C<@_>: |
|
927
|
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
my $result = col2col(\%data, sub { |
|
929
|
|
|
|
|
|
|
my ($x, $y) = @_; # $x and $y are array refs |
|
930
|
|
|
|
|
|
|
cor($x, $y); |
|
931
|
|
|
|
|
|
|
}); |
|
932
|
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
You can also pass a B<function name>. A bare name is looked up in |
|
934
|
|
|
|
|
|
|
C<Stats::LikeR::>, so these two are equivalent: |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
col2col(\%data, 'cor'); |
|
937
|
|
|
|
|
|
|
col2col(\%data, sub { cor($_[0], $_[1]) }); |
|
938
|
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
======================================================================== |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=head2 The result |
|
942
|
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
Always a hash of hashes: B<< C<< $result-E<gt>{from}{to} >> >>. |
|
944
|
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
for my $from (sort keys %$result) { |
|
946
|
|
|
|
|
|
|
for my $to (sort keys %{ $result->{$from} }) { |
|
947
|
|
|
|
|
|
|
printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to}; |
|
948
|
|
|
|
|
|
|
} |
|
949
|
|
|
|
|
|
|
} |
|
950
|
|
|
|
|
|
|
|
|
951
|
|
|
|
|
|
|
A column is never compared with itself, so C<< $result-E<gt>{a}{a} >> does not exist. |
|
952
|
|
|
|
|
|
|
|
|
953
|
|
|
|
|
|
|
======================================================================== |
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
=head2 Restricting columns (C<$cols>) |
|
956
|
|
|
|
|
|
|
|
|
957
|
|
|
|
|
|
|
By default every column is used as the "from" side. The third argument narrows |
|
958
|
|
|
|
|
|
|
that down — handy when you only care about one variable. |
|
959
|
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
# all columns vs all columns |
|
961
|
|
|
|
|
|
|
my $all = col2col(\%data, 'cor'); |
|
962
|
|
|
|
|
|
|
# just ONE column vs every other column |
|
963
|
|
|
|
|
|
|
my $one = col2col(\%data, 'cor', 'height'); |
|
964
|
|
|
|
|
|
|
my $cors = $one->{height}; # { weight => ..., age => ... } |
|
965
|
|
|
|
|
|
|
# a FEW specific columns vs every other column |
|
966
|
|
|
|
|
|
|
my $few = col2col(\%data, 'cor', ['height', 'weight']); |
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
The "to" side is always every other column; C<$cols> only limits the outer keys. |
|
969
|
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
======================================================================== |
|
971
|
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
=head2 Options |
|
973
|
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
Options can be given two ways: |
|
975
|
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols |
|
977
|
|
|
|
|
|
|
col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed |
|
978
|
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
The hash-ref form is convenient when you have B<no> column restriction — it saves |
|
980
|
|
|
|
|
|
|
you from passing a placeholder. (A hash ref I<replaces> C<$cols>, so you can't use |
|
981
|
|
|
|
|
|
|
it to restrict columns at the same time; use the trailing form for that.) |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
=head3 C<na> — how undefined values are handled |
|
984
|
|
|
|
|
|
|
|
|
985
|
|
|
|
|
|
|
Real data has gaps. C<na> decides what the function sees. |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
=begin html |
|
990
|
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
| Value |
|
995
|
|
|
|
|
|
|
| Behaviour |
|
996
|
|
|
|
|
|
|
| Use for |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
| |
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
| 'pairwise' (default) |
|
1002
|
|
|
|
|
|
|
| A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length. |
|
1003
|
|
|
|
|
|
|
| Paired stats like cor. |
|
1004
|
|
|
|
|
|
|
|
|
1005
|
|
|
|
|
|
|
|
|
1006
|
|
|
|
|
|
|
| 'omit' |
|
1007
|
|
|
|
|
|
|
| Each column drops its own undefined values independently. The two columns may end up different lengths. |
|
1008
|
|
|
|
|
|
|
| Unpaired tests like t_test, kruskal_test, where a gap in one sample shouldn't discard a value in the other. |
|
1009
|
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
| 'keep' |
|
1012
|
|
|
|
|
|
|
| Every row is passed through, undef and all. |
|
1013
|
|
|
|
|
|
|
| When your function does its own missing-data handling. |
|
1014
|
|
|
|
|
|
|
|
|
1015
|
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
| |
|
1017
|
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
=end html |
|
1019
|
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
# correlation: keep only complete pairs (the default) |
|
1023
|
|
|
|
|
|
|
col2col(\%data, 'cor'); |
|
1024
|
|
|
|
|
|
|
# two-sample test: each column keeps its own values |
|
1025
|
|
|
|
|
|
|
col2col(\%data, 't_test', undef, na => 'omit'); |
|
1026
|
|
|
|
|
|
|
col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder |
|
1027
|
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
C / C remain as boolean aliases for backward compatibility: |
|
1029
|
|
|
|
|
|
|
C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C. |
|
1030
|
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
=head3 C<skip.errors> — keep going when a pair fails I<(default: true)> |
|
1032
|
|
|
|
|
|
|
|
|
1033
|
|
|
|
|
|
|
Some functions croak on degenerate input — for example C<cor> dies if a column has |
|
1034
|
|
|
|
|
|
|
zero variance. By default C<col2col> B<traps> that croak per pair: instead of |
|
1035
|
|
|
|
|
|
|
aborting the whole run, it stores the B<text> of the error message in that |
|
1036
|
|
|
|
|
|
|
cell, so the result tells you I<which> pair failed and I<why>. Every other cell is |
|
1037
|
|
|
|
|
|
|
computed normally. |
|
1038
|
|
|
|
|
|
|
|
|
1039
|
|
|
|
|
|
|
my $r = col2col(\%data, 'cor'); |
|
1040
|
|
|
|
|
|
|
# a good pair: $r->{a}{b} == 0.83 |
|
1041
|
|
|
|
|
|
|
# a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0' |
|
1042
|
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
To restore the old "die on the first error" behaviour, turn it off: |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
col2col(\%data, 'cor', undef, 'skip.errors' => 0); |
|
1046
|
|
|
|
|
|
|
col2col(\%data, 'cor', { 'skip.errors' => 0 }); |
|
1047
|
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
Only errors from B are trapped. Mistakes in the call itself |
|
1049
|
|
|
|
|
|
|
(unknown column, bad data, unknown function name, unknown option) always die. |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
======================================================================== |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
=head2 Worked examples |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
B |
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
my $m = col2col(\%data, 'cor'); |
|
1058
|
|
|
|
|
|
|
|
|
1059
|
|
|
|
|
|
|
B |
|
1060
|
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
my $col = 'Testosterone, total (nmol/L)'; |
|
1062
|
|
|
|
|
|
|
my $cors = col2col($hoa, 'cor', $col)->{$col}; |
|
1063
|
|
|
|
|
|
|
for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) { |
|
1064
|
|
|
|
|
|
|
next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message |
|
1065
|
|
|
|
|
|
|
printf "%-30s % .3f\n", $other, $cors->{$other}; |
|
1066
|
|
|
|
|
|
|
} |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
B |
|
1069
|
|
|
|
|
|
|
|
|
1070
|
|
|
|
|
|
|
my $t = col2col($hoa, 't_test', undef, na => 'omit'); |
|
1071
|
|
|
|
|
|
|
|
|
1072
|
|
|
|
|
|
|
B |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
my $m = col2col($hoa, 'cor'); |
|
1075
|
|
|
|
|
|
|
for my $from (sort keys %$m) { |
|
1076
|
|
|
|
|
|
|
for my $to (sort keys %{ $m->{$from} }) { |
|
1077
|
|
|
|
|
|
|
my $v = $m->{$from}{$to}; |
|
1078
|
|
|
|
|
|
|
warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error |
|
1079
|
|
|
|
|
|
|
} |
|
1080
|
|
|
|
|
|
|
} |
|
1081
|
|
|
|
|
|
|
|
|
1082
|
|
|
|
|
|
|
======================================================================== |
|
1083
|
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
=head2 Gotchas |
|
1085
|
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
=over |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
=item * B, C<($col_a, $col_b)> — not a column and |
|
1089
|
|
|
|
|
|
|
a name. Unpack with C. |
|
1090
|
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
=item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall |
|
1092
|
|
|
|
|
|
|
variance can be flat on just the rows it shares with one partner, so C<cor> may |
|
1093
|
|
|
|
|
|
|
still croak for that pair. With the default C<skip.errors>, that shows up as a |
|
1094
|
|
|
|
|
|
|
message in the single offending cell rather than killing the run. |
|
1095
|
|
|
|
|
|
|
|
|
1096
|
|
|
|
|
|
|
=item * B<< C<col2col> does not modify your data. >> It reads the table and returns a new |
|
1097
|
|
|
|
|
|
|
hash of hashes. |
|
1098
|
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
=item * B — i.e. |
|
1100
|
|
|
|
|
|
|
C<y> is the inner ("to") key. So C<< $result-E<gt>{A}{B} >> reading C<…deviation of y is 0> |
|
1101
|
|
|
|
|
|
|
means column C<B> is the degenerate one for that pair. |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=back |
|
1104
|
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
=head2 cor |
|
1106
|
|
|
|
|
|
|
|
|
1107
|
|
|
|
|
|
|
cor($array1, $array2, $method = 'pearson'), |
|
1108
|
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
that is, C<pearson> is the default and will be used if C<$method> is not specified. |
|
1110
|
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
Just like R, C<pearson>, C<spearman>, and C<kendall> are available. |
|
1112
|
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
If you provide an array of arrays (a matrix), C<cor> will compute the correlation matrix automatically. |
|
1114
|
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
=head2 cor_test |
|
1116
|
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
my $result = cor_test( |
|
1118
|
|
|
|
|
|
|
'x' => $x, |
|
1119
|
|
|
|
|
|
|
'y' => $y, |
|
1120
|
|
|
|
|
|
|
alternative => 'two.sided', |
|
1121
|
|
|
|
|
|
|
method => 'pearson', |
|
1122
|
|
|
|
|
|
|
continuity => 1 |
|
1123
|
|
|
|
|
|
|
); |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
C safely handles C (or C) values seamlessly by computing over pairwise complete observations. |
|
1126
|
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
=head2 cov |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
cov($array1, $array2, 'pearson') |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
or |
|
1132
|
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
cov($array1, $array2, 'spearman') |
|
1134
|
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
or |
|
1136
|
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
cov($array1, $array2, 'kendall') |
|
1138
|
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
=head2 dnorm |
|
1140
|
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
gives the density of the normal distribution, with the specified mean and standard deviation. |
|
1142
|
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
In other words, it returns the height of the density curve at the value C<x>, given a mean, a standard deviation, and whether or not to return the log of the density. |
|
1144
|
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
returns a single scalar/number if a single value is given, otherwise returns an array reference. |
|
1146
|
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
Usage: |
|
1148
|
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
dnorm(4) # assumes a mean of 0 and standard deviation of 1 |
|
1150
|
|
|
|
|
|
|
|
|
1151
|
|
|
|
|
|
|
but default mean, standard deviation, and log can be passed as parameters: |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
$x = dnorm(0, mean => 0, sd => 2, 'log' => 0); |
|
1154
|
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
=head2 filter |
|
1156
|
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified. |
|
1158
|
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
my $df2 = filter($df, col('column.name') > 4); |
|
1160
|
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
C<filter> accepts a predicate in one of two forms: |
|
1162
|
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
=over |
|
1164
|
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
=item 1. a B<< C<col()> expression >> — a small, composable comparison built with overloaded operators, and |
|
1166
|
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
=item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>. |
|
1168
|
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=back |
|
1170
|
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
Both C<filter> and C<col> are exported by default. |
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
=head3 Arguments |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
|
|
1177
|
|
|
|
|
|
|
=begin html |
|
1178
|
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
|
|
1200
|
|
|
|
|
|
|
|
|
1201
|
|
|
|
|
|
|
=end html |
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
The return value is a B<new> data frame of the B<same shape> as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned. |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=head3 The C<col()> form |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
C<col('name')> is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C<filter> evaluates once per row. |
|
1210
|
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
filter($df, col('age') >= 18); # keep rows where age >= 18 |
|
1212
|
|
|
|
|
|
|
filter($df, col('sex') eq 'f'); # keep rows where sex is 'f' |
|
1213
|
|
|
|
|
|
|
filter($df, 18 <= col('age')); # operands may be in either order |
|
1214
|
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
=head3 Comparison operators |
|
1216
|
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
=begin html |
|
1220
|
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
=end html |
|
1244
|
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
C<col()> may appear on either side of the operator; C<< 4 E<lt> col('x') >> is automatically rewritten to the equivalent C<< col('x') E<gt> 4 >>. |
|
1248
|
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
=head3 Combining predicates: C<&>, C<|>, C<!> |
|
1250
|
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
Predicates compose with bitwise C<&> (and), C<|> (or), and C<!> (not): |
|
1252
|
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and |
|
1254
|
|
|
|
|
|
|
filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or |
|
1255
|
|
|
|
|
|
|
filter($df, !(col('x') > 100)); # not |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability. |
|
1258
|
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
=head3 The code-reference form |
|
1260
|
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
For logic the operators can't express, pass a C<sub { ... }>. It is called once per row; the B<row is passed as a hash reference>, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row. |
|
1262
|
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' }); |
|
1264
|
|
|
|
|
|
|
filter($df, sub { $_->{name} =~ /^A/ }); |
|
1265
|
|
|
|
|
|
|
filter($df, sub { $_[0]{score} > $_[0]{threshold} }); |
|
1266
|
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
For an HoA, each row is assembled into a temporary hash reference (C<< { column =E<gt> value, ... } >>) before the sub is called, so the same C<< $_-E<gt>{column} >> syntax works regardless of the input shape. |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
=head3 Examples |
|
1270
|
|
|
|
|
|
|
|
|
1271
|
|
|
|
|
|
|
use Stats::LikeR; |
|
1272
|
|
|
|
|
|
|
my $df = read_table('patients.csv'); # array of hashes |
|
1273
|
|
|
|
|
|
|
# numeric threshold |
|
1274
|
|
|
|
|
|
|
my $adults = filter($df, col('Age') >= 18); |
|
1275
|
|
|
|
|
|
|
# combine conditions |
|
1276
|
|
|
|
|
|
|
my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f')); |
|
1277
|
|
|
|
|
|
|
# arbitrary logic with a coderef |
|
1278
|
|
|
|
|
|
|
my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 }); |
|
1279
|
|
|
|
|
|
|
# hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel |
|
1280
|
|
|
|
|
|
|
my $hoa = read_table('patients.csv', 'output.type' => 'hoa'); |
|
1281
|
|
|
|
|
|
|
my $sub = filter($hoa, col('Age') > 32); |
|
1282
|
|
|
|
|
|
|
# $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned |
|
1283
|
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
=head3 Behavior and notes |
|
1285
|
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
=over |
|
1287
|
|
|
|
|
|
|
|
|
1288
|
|
|
|
|
|
|
=item * B<Non-destructive.> C<filter> builds and returns a new frame; C<$df> is left untouched. |
|
1289
|
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
=item * B<< A missing or C<undef> cell never matches >> a C<col> comparison. For example C<< col('x') > 0 >> silently drops any row that has no C<x> value or whose C<x> is C<undef>. |
|
1291
|
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
=item * B<AoH rows are shared, not copied>, into the returned frame: the returned array references the I<same> row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays. |
|
1293
|
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
=item * B<Edge cases> are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA). |
|
1295
|
|
|
|
|
|
|
|
|
1296
|
|
|
|
|
|
|
=item * B<Input validation.> Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error. |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
=item * B<Implementation.> The C<col>/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS. |
|
1299
|
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
=back |
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
=head3 See also |
|
1303
|
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
C<read_table> (whose C<filter> option applies the same coderef convention while reading a file), C<group_by>. |
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
=head2 fisher_test |
|
1307
|
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
=head3 array reference entry |
|
1309
|
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
my $array_data = [ |
|
1311
|
|
|
|
|
|
|
[10, 2], |
|
1312
|
|
|
|
|
|
|
[3, 15] |
|
1313
|
|
|
|
|
|
|
]; |
|
1314
|
|
|
|
|
|
|
my $res1 = fisher_test($array_data); |
|
1315
|
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
which returns a hash reference: |
|
1317
|
|
|
|
|
|
|
|
|
1318
|
|
|
|
|
|
|
{ |
|
1319
|
|
|
|
|
|
|
alternative "two.sided", |
|
1320
|
|
|
|
|
|
|
conf_int [ |
|
1321
|
|
|
|
|
|
|
[0] 2.75343836564204, |
|
1322
|
|
|
|
|
|
|
[1] 300.682787419401 |
|
1323
|
|
|
|
|
|
|
], |
|
1324
|
|
|
|
|
|
|
conf_level 0.95, |
|
1325
|
|
|
|
|
|
|
estimate { |
|
1326
|
|
|
|
|
|
|
"odds ratio" 21.3053312750168 |
|
1327
|
|
|
|
|
|
|
}, |
|
1328
|
|
|
|
|
|
|
method "Fisher's Exact Test for Count Data", |
|
1329
|
|
|
|
|
|
|
p_value 0.000536724119143435 |
|
1330
|
|
|
|
|
|
|
} |
|
1331
|
|
|
|
|
|
|
|
|
1332
|
|
|
|
|
|
|
=head3 hash reference entry |
|
1333
|
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
$ft = fisher_test( { |
|
1335
|
|
|
|
|
|
|
Guess => { |
|
1336
|
|
|
|
|
|
|
Milk => 3, Tea => 1 |
|
1337
|
|
|
|
|
|
|
}, |
|
1338
|
|
|
|
|
|
|
Truth => { |
|
1339
|
|
|
|
|
|
|
Milk => 1, Tea => 3 |
|
1340
|
|
|
|
|
|
|
} |
|
1341
|
|
|
|
|
|
|
}); |
|
1342
|
|
|
|
|
|
|
|
|
1343
|
|
|
|
|
|
|
=head2 glm |
|
1344
|
|
|
|
|
|
|
|
|
1345
|
|
|
|
|
|
|
Takes a hash of arrays as input: |
|
1346
|
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
my %tooth_growth = ( |
|
1348
|
|
|
|
|
|
|
dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 |
|
1349
|
|
|
|
|
|
|
1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 |
|
1350
|
|
|
|
|
|
|
0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 |
|
1351
|
|
|
|
|
|
|
2.0 2.0 2.0)], |
|
1352
|
|
|
|
|
|
|
len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5 |
|
1353
|
|
|
|
|
|
|
17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5 |
|
1354
|
|
|
|
|
|
|
15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0 |
|
1355
|
|
|
|
|
|
|
25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)], |
|
1356
|
|
|
|
|
|
|
supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC |
|
1357
|
|
|
|
|
|
|
VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ |
|
1358
|
|
|
|
|
|
|
OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)] |
|
1359
|
|
|
|
|
|
|
); |
|
1360
|
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
my $glm_teeth = glm( |
|
1362
|
|
|
|
|
|
|
data => \%tooth_growth, |
|
1363
|
|
|
|
|
|
|
formula => 'len ~ dose + supp', |
|
1364
|
|
|
|
|
|
|
family => 'gaussian' |
|
1365
|
|
|
|
|
|
|
); |
|
1366
|
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
In addition to the C<gaussian> default, it fully supports logistic regression using the C<binomial> family parameter via Iteratively Reweighted Least Squares (IRLS): |
|
1368
|
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial'); |
|
1370
|
|
|
|
|
|
|
|
|
1371
|
|
|
|
|
|
|
=head3 Input Parameters |
|
1372
|
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
|
|
1375
|
|
|
|
|
|
|
=begin html |
|
1376
|
|
|
|
|
|
|
|
|
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
|
1379
|
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
| Parameter |
|
1381
|
|
|
|
|
|
|
| Type |
|
1382
|
|
|
|
|
|
|
| Default |
|
1383
|
|
|
|
|
|
|
| Description |
|
1384
|
|
|
|
|
|
|
| Example |
|
1385
|
|
|
|
|
|
|
|
|
1386
|
|
|
|
|
|
|
| |
|
1387
|
|
|
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
| formula |
|
1390
|
|
|
|
|
|
|
| String |
|
1391
|
|
|
|
|
|
|
| None (Required) |
|
1392
|
|
|
|
|
|
|
| A symbolic description of the model to be fitted. Supports operators like +, :, *, ^, and -1 (to remove the intercept). |
|
1393
|
|
|
|
|
|
|
| 'am ~ wt + hp', 'y ~ x - 1' |
|
1394
|
|
|
|
|
|
|
|
|
1395
|
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
| data |
|
1397
|
|
|
|
|
|
|
| HashRef or ArrayRef |
|
1398
|
|
|
|
|
|
|
| None (Required) |
|
1399
|
|
|
|
|
|
|
| The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH). |
|
1400
|
|
|
|
|
|
|
| \%mtcars, [{x => 1, y => 2}, ...] |
|
1401
|
|
|
|
|
|
|
|
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
| family |
|
1404
|
|
|
|
|
|
|
| String |
|
1405
|
|
|
|
|
|
|
| 'gaussian' |
|
1406
|
|
|
|
|
|
|
| A description of the error distribution and link function to be used in the model. Currently supports 'gaussian' (identity link) and 'binomial' (logit link). |
|
1407
|
|
|
|
|
|
|
| 'binomial' |
|
1408
|
|
|
|
|
|
|
|
|
1409
|
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
| |
|
1411
|
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
=end html |
|
1413
|
|
|
|
|
|
|
|
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
|
|
1416
|
|
|
|
|
|
|
=head3 Output variables |
|
1417
|
|
|
|
|
|
|
|
|
1418
|
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
|
|
1420
|
|
|
|
|
|
|
=begin html |
|
1421
|
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
|
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
|
|
1425
|
|
|
|
|
|
|
| Variable |
|
1426
|
|
|
|
|
|
|
| Type |
|
1427
|
|
|
|
|
|
|
| Description |
|
1428
|
|
|
|
|
|
|
| Example |
|
1429
|
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
| |
|
1431
|
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
| aic |
|
1434
|
|
|
|
|
|
|
| Double |
|
1435
|
|
|
|
|
|
|
| Akaike's Information Criterion for the fitted model. |
|
1436
|
|
|
|
|
|
|
| 123.45 |
|
1437
|
|
|
|
|
|
|
|
|
1438
|
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
| boundary |
|
1440
|
|
|
|
|
|
|
| Integer (Boolean) |
|
1441
|
|
|
|
|
|
|
| 1 if the fitted values computationally reached the 0 or 1 boundary (specific to the binomial family), 0 otherwise. |
|
1442
|
|
|
|
|
|
|
| 0 |
|
1443
|
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
| coefficients |
|
1446
|
|
|
|
|
|
|
| HashRef |
|
1447
|
|
|
|
|
|
|
| A hash mapping the expanded model term names to their estimated coefficient values. |
|
1448
|
|
|
|
|
|
|
| {'Intercept' => 1.5, 'wt' => -0.5} |
|
1449
|
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
| converged |
|
1452
|
|
|
|
|
|
|
| Integer (Boolean) |
|
1453
|
|
|
|
|
|
|
| 1 if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, 0 otherwise. |
|
1454
|
|
|
|
|
|
|
| 1 |
|
1455
|
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
|
|
1457
|
|
|
|
|
|
|
| deviance |
|
1458
|
|
|
|
|
|
|
| Double |
|
1459
|
|
|
|
|
|
|
| The residual deviance of the fitted model. |
|
1460
|
|
|
|
|
|
|
| 15.2 |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
| deviance.resid |
|
1464
|
|
|
|
|
|
|
| HashRef |
|
1465
|
|
|
|
|
|
|
| A hash mapping data row names to their computed deviance residuals. |
|
1466
|
|
|
|
|
|
|
| {'Mazda RX4' => 0.12} |
|
1467
|
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
| df.null |
|
1470
|
|
|
|
|
|
|
| Integer |
|
1471
|
|
|
|
|
|
|
| The residual degrees of freedom for the null model. |
|
1472
|
|
|
|
|
|
|
| 31 |
|
1473
|
|
|
|
|
|
|
|
|
1474
|
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
| df.residual |
|
1476
|
|
|
|
|
|
|
| Integer |
|
1477
|
|
|
|
|
|
|
| The residual degrees of freedom for the fitted model. |
|
1478
|
|
|
|
|
|
|
| 30 |
|
1479
|
|
|
|
|
|
|
|
|
1480
|
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
| family |
|
1482
|
|
|
|
|
|
|
| String |
|
1483
|
|
|
|
|
|
|
| The statistical family used to fit the model. |
|
1484
|
|
|
|
|
|
|
| "gaussian" |
|
1485
|
|
|
|
|
|
|
|
|
1486
|
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
| fitted.values |
|
1488
|
|
|
|
|
|
|
| HashRef |
|
1489
|
|
|
|
|
|
|
| A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response). |
|
1490
|
|
|
|
|
|
|
| {'Mazda RX4' => 0.85} |
|
1491
|
|
|
|
|
|
|
|
|
1492
|
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
| iter |
|
1494
|
|
|
|
|
|
|
| Integer |
|
1495
|
|
|
|
|
|
|
| The number of IRLS iterations performed before convergence or hitting the iteration limit. |
|
1496
|
|
|
|
|
|
|
| 4 |
|
1497
|
|
|
|
|
|
|
|
|
1498
|
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
| null.deviance |
|
1500
|
|
|
|
|
|
|
| Double |
|
1501
|
|
|
|
|
|
|
| The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed). |
|
1502
|
|
|
|
|
|
|
| 43.5 |
|
1503
|
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
| rank |
|
1506
|
|
|
|
|
|
|
| Integer |
|
1507
|
|
|
|
|
|
|
| The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters). |
|
1508
|
|
|
|
|
|
|
| 2 |
|
1509
|
|
|
|
|
|
|
|
|
1510
|
|
|
|
|
|
|
|
|
1511
|
|
|
|
|
|
|
| summary |
|
1512
|
|
|
|
|
|
|
| HashRef |
|
1513
|
|
|
|
|
|
|
| A nested hash mapping each term to its detailed summary statistics, including Estimate, Std. Error, t value / z value, and Pr(>|t|) / Pr(>|z|). Aliased parameters return "NaN". |
|
1514
|
|
|
|
|
|
|
| {'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}} |
|
1515
|
|
|
|
|
|
|
|
|
1516
|
|
|
|
|
|
|
|
|
1517
|
|
|
|
|
|
|
| terms |
|
1518
|
|
|
|
|
|
|
| ArrayRef |
|
1519
|
|
|
|
|
|
|
| An ordered list of the expanded term names included in the model matrix. |
|
1520
|
|
|
|
|
|
|
| ['Intercept', 'wt', 'hp'] |
|
1521
|
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
|
|
1523
|
|
|
|
|
|
|
| |
|
1524
|
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
=end html |
|
1526
|
|
|
|
|
|
|
|
|
1527
|
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
|
|
1529
|
|
|
|
|
|
|
=head2 group_by |
|
1530
|
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column. |
|
1532
|
|
|
|
|
|
|
|
|
1533
|
|
|
|
|
|
|
my $aoh_data = [ |
|
1534
|
|
|
|
|
|
|
{ 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 }, |
|
1535
|
|
|
|
|
|
|
{ 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 }, |
|
1536
|
|
|
|
|
|
|
{ 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 }, |
|
1537
|
|
|
|
|
|
|
{ 'Gender' => 'Female' } # Intentional missing target value |
|
1538
|
|
|
|
|
|
|
]; |
|
1539
|
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
as well as |
|
1541
|
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
$hoh_data = { |
|
1543
|
|
|
|
|
|
|
'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 }, |
|
1544
|
|
|
|
|
|
|
'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 }, |
|
1545
|
|
|
|
|
|
|
'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 }, |
|
1546
|
|
|
|
|
|
|
'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value |
|
1547
|
|
|
|
|
|
|
'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef |
|
1548
|
|
|
|
|
|
|
}; |
|
1549
|
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
and |
|
1551
|
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
my $hoa_data = { |
|
1553
|
|
|
|
|
|
|
'Gender' => ['Male', 'Female', 'Male', 'Female'], |
|
1554
|
|
|
|
|
|
|
'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ] |
|
1555
|
|
|
|
|
|
|
}; |
|
1556
|
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
then run the function thus: |
|
1558
|
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender'); |
|
1560
|
|
|
|
|
|
|
|
|
1561
|
|
|
|
|
|
|
The output can be thought of like a hash, with the first string broken down by the second. |
|
1562
|
|
|
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
all become hash of arrays: |
|
1564
|
|
|
|
|
|
|
|
|
1565
|
|
|
|
|
|
|
{ |
|
1566
|
|
|
|
|
|
|
Female [ |
|
1567
|
|
|
|
|
|
|
[0] 1.8 |
|
1568
|
|
|
|
|
|
|
], |
|
1569
|
|
|
|
|
|
|
Male [ |
|
1570
|
|
|
|
|
|
|
[0] 18.2, |
|
1571
|
|
|
|
|
|
|
[1] 20.5 |
|
1572
|
|
|
|
|
|
|
] |
|
1573
|
|
|
|
|
|
|
} |
|
1574
|
|
|
|
|
|
|
|
|
1575
|
|
|
|
|
|
|
returns an empty array of hashes if neither target nor group keys are found. |
|
1576
|
|
|
|
|
|
|
|
|
1577
|
|
|
|
|
|
|
=head3 Filtering |
|
1578
|
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
Data can be further broken down with filter subs, like in C<read_table>: |
|
1580
|
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
my $testosterone = group_by($d, # group testosterone by "Gender" |
|
1582
|
|
|
|
|
|
|
'Testosterone, total (nmol/L)', |
|
1583
|
|
|
|
|
|
|
'Gender', |
|
1584
|
|
|
|
|
|
|
{ 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter |
|
1585
|
|
|
|
|
|
|
{ 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter |
|
1586
|
|
|
|
|
|
|
); |
|
1587
|
|
|
|
|
|
|
|
|
1588
|
|
|
|
|
|
|
where each filter is applied to the named column, i.e. the second-level hash key. |
|
1589
|
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
=head2 hoh2hoa |
|
1591
|
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
Convert a B<hash of hashes> (row-major: outer key = row, inner key = column) |
|
1593
|
|
|
|
|
|
|
into a B<hash of arrays> (column-major: key = column, value = that column's |
|
1594
|
|
|
|
|
|
|
cells down the rows). |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
use Stats::LikeR; |
|
1597
|
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
my %hoh = ( |
|
1599
|
|
|
|
|
|
|
'r1' => { 'a' => 1, 'b' => 2 }, |
|
1600
|
|
|
|
|
|
|
'r2' => { 'a' => 3, 'b' => 4 }, |
|
1601
|
|
|
|
|
|
|
); |
|
1602
|
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
my $hoa = hoh2hoa(\%hoh); |
|
1604
|
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
which returns |
|
1606
|
|
|
|
|
|
|
{ |
|
1607
|
|
|
|
|
|
|
a => [1, 3], |
|
1608
|
|
|
|
|
|
|
b => [2, 4], |
|
1609
|
|
|
|
|
|
|
} |
|
1610
|
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
=head3 Behavior |
|
1612
|
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
=over |
|
1614
|
|
|
|
|
|
|
|
|
1615
|
|
|
|
|
|
|
=item * B<Columns> are the union of every inner key, so a key that appears in only |
|
1616
|
|
|
|
|
|
|
some rows still becomes a column. |
|
1617
|
|
|
|
|
|
|
|
|
1618
|
|
|
|
|
|
|
=item * B<Rows> are emitted in sorted outer-key (row-name) order, and that one order |
|
1619
|
|
|
|
|
|
|
is used for every column, so the arrays stay aligned and the result is |
|
1620
|
|
|
|
|
|
|
reproducible regardless of hash ordering. |
|
1621
|
|
|
|
|
|
|
|
|
1622
|
|
|
|
|
|
|
=item * B<Missing cells> — a missing inner key, or a cell whose value is C<undef> — are filled |
|
1623
|
|
|
|
|
|
|
with the fill value (see C<undef.val> below). Every column therefore has |
|
1624
|
|
|
|
|
|
|
exactly one entry per row. |
|
1625
|
|
|
|
|
|
|
|
|
1626
|
|
|
|
|
|
|
=item * Values are B<copied> into the result; the original structure is left |
|
1627
|
|
|
|
|
|
|
untouched. |
|
1628
|
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
=item * An B<empty> hash of hashes returns an empty hash of arrays (it is not an |
|
1630
|
|
|
|
|
|
|
error). |
|
1631
|
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
=back |
|
1633
|
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
=head3 Options |
|
1635
|
|
|
|
|
|
|
|
|
1636
|
|
|
|
|
|
|
Options are passed as trailing C<< name => value >> pairs. |
|
1637
|
|
|
|
|
|
|
|
|
1638
|
|
|
|
|
|
|
|
|
1639
|
|
|
|
|
|
|
|
|
1640
|
|
|
|
|
|
|
=begin html |
|
1641
|
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
|
|
|
1645
|
|
|
|
|
|
|
| Option |
|
1646
|
|
|
|
|
|
|
| Default |
|
1647
|
|
|
|
|
|
|
| Meaning |
|
1648
|
|
|
|
|
|
|
|
|
1649
|
|
|
|
|
|
|
| |
|
1650
|
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
| undef.val |
|
1653
|
|
|
|
|
|
|
| undef |
|
1654
|
|
|
|
|
|
|
| Value used to fill a missing key or an undef cell. Any defined scalar works, including 0 and ''. Passing undef keeps the default. |
|
1655
|
|
|
|
|
|
|
|
|
1656
|
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
| row.names |
|
1658
|
|
|
|
|
|
|
| (none) |
|
1659
|
|
|
|
|
|
|
| If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column. |
|
1660
|
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
| |
|
1663
|
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
=end html |
|
1665
|
|
|
|
|
|
|
|
|
1666
|
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
|
|
1668
|
|
|
|
|
|
|
# Ragged input with an explicit fill string: |
|
1669
|
|
|
|
|
|
|
my %ragged = ( |
|
1670
|
|
|
|
|
|
|
'r1' => { 'a' => 1, 'b' => 2 }, |
|
1671
|
|
|
|
|
|
|
'r2' => { 'a' => 3, 'c' => 9 }, |
|
1672
|
|
|
|
|
|
|
); |
|
1673
|
|
|
|
|
|
|
my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA'); |
|
1674
|
|
|
|
|
|
|
# { |
|
1675
|
|
|
|
|
|
|
# a => [1, 3 ], |
|
1676
|
|
|
|
|
|
|
# b => [2, 'NA'], |
|
1677
|
|
|
|
|
|
|
# c => ['NA', 9 ], |
|
1678
|
|
|
|
|
|
|
# } |
|
1679
|
|
|
|
|
|
|
|
|
1680
|
|
|
|
|
|
|
# Keep the row labels as a column: |
|
1681
|
|
|
|
|
|
|
my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id'); |
|
1682
|
|
|
|
|
|
|
# { |
|
1683
|
|
|
|
|
|
|
# id => ['r1', 'r2'], |
|
1684
|
|
|
|
|
|
|
# a => [1, 3 ], |
|
1685
|
|
|
|
|
|
|
# b => [2, undef], |
|
1686
|
|
|
|
|
|
|
# c => [undef, 9 ], |
|
1687
|
|
|
|
|
|
|
# } |
|
1688
|
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
=head3 Errors |
|
1690
|
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
C<hoh2hoa> dies (via C<croak>) when: |
|
1692
|
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
=over |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
=item * the argument is not a hash reference, |
|
1696
|
|
|
|
|
|
|
|
|
1697
|
|
|
|
|
|
|
=item * any value in the hash is not itself a hash reference, |
|
1698
|
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
=item * an unknown option is given, or the options are not C<< name => value >> pairs, |
|
1700
|
|
|
|
|
|
|
|
|
1701
|
|
|
|
|
|
|
=item * C<row.names> is not a plain string, or it names an already-present column. |
|
1702
|
|
|
|
|
|
|
|
|
1703
|
|
|
|
|
|
|
=back |
|
1704
|
|
|
|
|
|
|
|
|
1705
|
|
|
|
|
|
|
=head2 hist |
|
1706
|
|
|
|
|
|
|
|
|
1707
|
|
|
|
|
|
|
Computes the histogram of the given data values in a single O(N) pass. It returns the bin counts, computed breaks, midpoints, and density. |
|
1708
|
|
|
|
|
|
|
|
|
1709
|
|
|
|
|
|
|
my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4); |
|
1710
|
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
If C<breaks> is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula. |
|
1712
|
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
=head2 kruskal_test |
|
1714
|
|
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test) |
|
1716
|
|
|
|
|
|
|
|
|
1717
|
|
|
|
|
|
|
Performs a Kruskal-Wallis rank sum test, see |
|
1718
|
|
|
|
|
|
|
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test |
|
1719
|
|
|
|
|
|
|
|
|
1720
|
|
|
|
|
|
|
=head3 hash of array entry |
|
1721
|
|
|
|
|
|
|
|
|
1722
|
|
|
|
|
|
|
I feel that this is better, and more easily read, than what you get in R: |
|
1723
|
|
|
|
|
|
|
|
|
1724
|
|
|
|
|
|
|
my %x = ( |
|
1725
|
|
|
|
|
|
|
'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2], |
|
1726
|
|
|
|
|
|
|
'obs. airway disease' => [3.8, 2.7, 4.0, 2.4], |
|
1727
|
|
|
|
|
|
|
'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0] |
|
1728
|
|
|
|
|
|
|
); |
|
1729
|
|
|
|
|
|
|
$kt = kruskal_test(\%x); |
|
1730
|
|
|
|
|
|
|
|
|
1731
|
|
|
|
|
|
|
=head3 R-like array entry |
|
1732
|
|
|
|
|
|
|
|
|
1733
|
|
|
|
|
|
|
my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects |
|
1734
|
|
|
|
|
|
|
my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease |
|
1735
|
|
|
|
|
|
|
my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis |
|
1736
|
|
|
|
|
|
|
my @x = (@xk, @yk, @zk); |
|
1737
|
|
|
|
|
|
|
my @g = ( |
|
1738
|
|
|
|
|
|
|
(map {'Normal subjects'} 0..4), |
|
1739
|
|
|
|
|
|
|
(map {'Subjects with obstructive airway disease'} 0..3), |
|
1740
|
|
|
|
|
|
|
map {'Subjects with asbestosis'} 0..4 |
|
1741
|
|
|
|
|
|
|
); |
|
1742
|
|
|
|
|
|
|
my $kt = kruskal_test(\@x, \@g); |
|
1743
|
|
|
|
|
|
|
|
|
1744
|
|
|
|
|
|
|
=head2 ks_test |
|
1745
|
|
|
|
|
|
|
|
|
1746
|
|
|
|
|
|
|
The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are drawn from the same distribution, is implemented simply: |
|
1747
|
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
$ks = ks_test(\@x, \@y, alternative => 'greater'); |
|
1749
|
|
|
|
|
|
|
|
|
1750
|
|
|
|
|
|
|
returning a hash reference. |
|
1751
|
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
Also, a single array can be tested against a normal distribution: |
|
1753
|
|
|
|
|
|
|
|
|
1754
|
|
|
|
|
|
|
$ks = ks_test($ksx, 'pnorm'); |
|
1755
|
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
The p-value precision is about 1e-8, which I want to improve, but am not sure how. |
|
1757
|
|
|
|
|
|
|
|
|
1758
|
|
|
|
|
|
|
=head2 ljoin |
|
1759
|
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>. |
|
1761
|
|
|
|
|
|
|
C<ljoin> will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>. |
|
1762
|
|
|
|
|
|
|
|
|
1763
|
|
|
|
|
|
|
For example, |
|
1764
|
|
|
|
|
|
|
|
|
1765
|
|
|
|
|
|
|
{ |
|
1766
|
|
|
|
|
|
|
"Jack Smith" { |
|
1767
|
|
|
|
|
|
|
age 30 |
|
1768
|
|
|
|
|
|
|
} |
|
1769
|
|
|
|
|
|
|
} |
|
1770
|
|
|
|
|
|
|
|
|
1771
|
|
|
|
|
|
|
and a second hash, |
|
1772
|
|
|
|
|
|
|
{ |
|
1773
|
|
|
|
|
|
|
"Jack Smith" { |
|
1774
|
|
|
|
|
|
|
dept "Engineering" |
|
1775
|
|
|
|
|
|
|
}, |
|
1776
|
|
|
|
|
|
|
"Jane Doe" { |
|
1777
|
|
|
|
|
|
|
age 25 |
|
1778
|
|
|
|
|
|
|
} |
|
1779
|
|
|
|
|
|
|
} |
|
1780
|
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
In this case, running C<ljoin> will modify C<%h> to give: |
|
1782
|
|
|
|
|
|
|
|
|
1783
|
|
|
|
|
|
|
{ |
|
1784
|
|
|
|
|
|
|
"Jack Smith" { |
|
1785
|
|
|
|
|
|
|
age 30, |
|
1786
|
|
|
|
|
|
|
dept "Engineering" |
|
1787
|
|
|
|
|
|
|
} |
|
1788
|
|
|
|
|
|
|
} |
|
1789
|
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
=head2 lm |
|
1791
|
|
|
|
|
|
|
|
|
1792
|
|
|
|
|
|
|
This is the linear models function. |
|
1793
|
|
|
|
|
|
|
|
|
1794
|
|
|
|
|
|
|
$lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars); |
|
1795
|
|
|
|
|
|
|
|
|
1796
|
|
|
|
|
|
|
where C<$mtcars> is a hash of hashes |
|
1797
|
|
|
|
|
|
|
|
|
1798
|
|
|
|
|
|
|
C<lm> also supports generating interaction terms directly within the formula using the C<*> operator: |
|
1799
|
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars); |
|
1801
|
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
If your data contains missing numbers (C<undef> or C<NA>), C<lm> handles listwise deletion dynamically to ensure mathematical integrity before fitting. |
|
1803
|
|
|
|
|
|
|
|
|
1804
|
|
|
|
|
|
|
the dot operator also works: |
|
1805
|
|
|
|
|
|
|
|
|
1806
|
|
|
|
|
|
|
$lm = lm(formula => 'y ~ .', data => $dot_data); |
|
1807
|
|
|
|
|
|
|
|
|
1808
|
|
|
|
|
|
|
=head2 matrix |
|
1809
|
|
|
|
|
|
|
|
|
1810
|
|
|
|
|
|
|
my $mat1 = matrix( |
|
1811
|
|
|
|
|
|
|
data => [1..6], |
|
1812
|
|
|
|
|
|
|
nrow => 2 |
|
1813
|
|
|
|
|
|
|
); |
|
1814
|
|
|
|
|
|
|
|
|
1815
|
|
|
|
|
|
|
You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise. |
|
1816
|
|
|
|
|
|
|
|
|
1817
|
|
|
|
|
|
|
As of version 0.10, parameters do not need to be named, so that C<matrix> works more like R: |
|
1818
|
|
|
|
|
|
|
|
|
1819
|
|
|
|
|
|
|
my $d = matrix(rnorm(32000), 1000, 32); |
|
1820
|
|
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
|
where the positional arguments are taken as C<data>, C<nrow>, and C<ncol>. |
|
1822
|
|
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
|
=head2 max |
|
1824
|
|
|
|
|
|
|
|
|
1825
|
|
|
|
|
|
|
max(1,2,3); |
|
1826
|
|
|
|
|
|
|
|
|
1827
|
|
|
|
|
|
|
or |
|
1828
|
|
|
|
|
|
|
|
|
1829
|
|
|
|
|
|
|
my @arr = 1..8; |
|
1830
|
|
|
|
|
|
|
max(@arr, 4, 5) |
|
1831
|
|
|
|
|
|
|
|
|
1832
|
|
|
|
|
|
|
as of version 0.02, max will die if any undefined values are provided |
|
1833
|
|
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
|
=head2 mean |
|
1835
|
|
|
|
|
|
|
|
|
1836
|
|
|
|
|
|
|
mean(1,2,3); |
|
1837
|
|
|
|
|
|
|
|
|
1838
|
|
|
|
|
|
|
or |
|
1839
|
|
|
|
|
|
|
|
|
1840
|
|
|
|
|
|
|
my @arr = 1..8; |
|
1841
|
|
|
|
|
|
|
mean(@arr, 4, 5) |
|
1842
|
|
|
|
|
|
|
|
|
1843
|
|
|
|
|
|
|
or |
|
1844
|
|
|
|
|
|
|
|
|
1845
|
|
|
|
|
|
|
mean([1,1], [2,2]) # 1.5 |
|
1846
|
|
|
|
|
|
|
|
|
1847
|
|
|
|
|
|
|
as of version 0.02, mean will die if any undefined values are provided |
|
1848
|
|
|
|
|
|
|
|
|
1849
|
|
|
|
|
|
|
=head2 median |
|
1850
|
|
|
|
|
|
|
|
|
1851
|
|
|
|
|
|
|
works like mean, taking array references and arrays: |
|
1852
|
|
|
|
|
|
|
|
|
1853
|
|
|
|
|
|
|
median( $test_data[$i][0] ) |
|
1854
|
|
|
|
|
|
|
|
|
1855
|
|
|
|
|
|
|
as of version 0.02, median will die if any undefined values are provided |
|
1856
|
|
|
|
|
|
|
|
|
1857
|
|
|
|
|
|
|
=head2 min |
|
1858
|
|
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
|
min(1,2,3); |
|
1860
|
|
|
|
|
|
|
|
|
1861
|
|
|
|
|
|
|
or |
|
1862
|
|
|
|
|
|
|
|
|
1863
|
|
|
|
|
|
|
my @arr = 1..8; |
|
1864
|
|
|
|
|
|
|
min(@arr, 4, 5) |
|
1865
|
|
|
|
|
|
|
|
|
1866
|
|
|
|
|
|
|
as of version 0.02, min will die if any undefined values are provided |
|
1867
|
|
|
|
|
|
|
|
|
1868
|
|
|
|
|
|
|
=head2 mode |
|
1869
|
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings) |
|
1871
|
|
|
|
|
|
|
|
|
1872
|
|
|
|
|
|
|
@arr = mode([1,3,3,3]); # returns (3) |
|
1873
|
|
|
|
|
|
|
|
|
1874
|
|
|
|
|
|
|
@arr = mode('a','a','c','c','z'); # returns ('a', 'c') |
|
1875
|
|
|
|
|
|
|
|
|
1876
|
|
|
|
|
|
|
=head2 oneway_test |
|
1877
|
|
|
|
|
|
|
|
|
1878
|
|
|
|
|
|
|
Like ANOVA/aov, but does not assume equal variances between groups (Welch's one-way test, as in R's C<oneway.test>) |
|
1879
|
|
|
|
|
|
|
|
|
1880
|
|
|
|
|
|
|
=head3 hash of array input |
|
1881
|
|
|
|
|
|
|
|
|
1882
|
|
|
|
|
|
|
$test_data = oneway_test({ |
|
1883
|
|
|
|
|
|
|
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
1884
|
|
|
|
|
|
|
ctrl => [1, 1, 1, 0, 0, 0] |
|
1885
|
|
|
|
|
|
|
}); |
|
1886
|
|
|
|
|
|
|
|
|
1887
|
|
|
|
|
|
|
which will output a hash reference: |
|
1888
|
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
{ |
|
1890
|
|
|
|
|
|
|
Group { |
|
1891
|
|
|
|
|
|
|
Df 1, |
|
1892
|
|
|
|
|
|
|
"F value" 177.504798464491, |
|
1893
|
|
|
|
|
|
|
"Mean Sq" 61.6533333333333, |
|
1894
|
|
|
|
|
|
|
Pr(>F) 1.31343255160843e-07, |
|
1895
|
|
|
|
|
|
|
"Sum Sq" 61.6533333333333 |
|
1896
|
|
|
|
|
|
|
}, |
|
1897
|
|
|
|
|
|
|
group_stats { |
|
1898
|
|
|
|
|
|
|
mean { |
|
1899
|
|
|
|
|
|
|
ctrl 0.5, |
|
1900
|
|
|
|
|
|
|
yield 5.03333333333333 |
|
1901
|
|
|
|
|
|
|
}, |
|
1902
|
|
|
|
|
|
|
size { |
|
1903
|
|
|
|
|
|
|
ctrl 6, |
|
1904
|
|
|
|
|
|
|
yield 6 |
|
1905
|
|
|
|
|
|
|
} |
|
1906
|
|
|
|
|
|
|
}, |
|
1907
|
|
|
|
|
|
|
Residuals { |
|
1908
|
|
|
|
|
|
|
Df 9.81767348326473, |
|
1909
|
|
|
|
|
|
|
"Mean Sq" 0.353783749200256, |
|
1910
|
|
|
|
|
|
|
"Sum Sq" 3.47333333333333 |
|
1911
|
|
|
|
|
|
|
} |
|
1912
|
|
|
|
|
|
|
|
|
1913
|
|
|
|
|
|
|
} |
|
1914
|
|
|
|
|
|
|
|
|
1915
|
|
|
|
|
|
|
=head3 array of array input |
|
1916
|
|
|
|
|
|
|
|
|
1917
|
|
|
|
|
|
|
oneway_test([ |
|
1918
|
|
|
|
|
|
|
[5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
1919
|
|
|
|
|
|
|
[1, 1, 1, 0, 0, 0] |
|
1920
|
|
|
|
|
|
|
]); |
|
1921
|
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
which will output a nearly identical hash reference as for hash of arrays: |
|
1923
|
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
{ |
|
1925
|
|
|
|
|
|
|
Group { |
|
1926
|
|
|
|
|
|
|
Df 1, |
|
1927
|
|
|
|
|
|
|
"F value" 177.504798464491, |
|
1928
|
|
|
|
|
|
|
"Mean Sq" 61.6533333333333, |
|
1929
|
|
|
|
|
|
|
Pr(>F) 1.31343255160843e-07, |
|
1930
|
|
|
|
|
|
|
"Sum Sq" 61.6533333333333 |
|
1931
|
|
|
|
|
|
|
}, |
|
1932
|
|
|
|
|
|
|
group_stats { |
|
1933
|
|
|
|
|
|
|
mean { |
|
1934
|
|
|
|
|
|
|
"Index 0" 5.03333333333333, |
|
1935
|
|
|
|
|
|
|
"Index 1" 0.5 |
|
1936
|
|
|
|
|
|
|
}, |
|
1937
|
|
|
|
|
|
|
size { |
|
1938
|
|
|
|
|
|
|
"Index 0" 6, |
|
1939
|
|
|
|
|
|
|
"Index 1" 6 |
|
1940
|
|
|
|
|
|
|
} |
|
1941
|
|
|
|
|
|
|
}, |
|
1942
|
|
|
|
|
|
|
Residuals { |
|
1943
|
|
|
|
|
|
|
Df 9.81767348326473, |
|
1944
|
|
|
|
|
|
|
"Mean Sq" 0.353783749200256, |
|
1945
|
|
|
|
|
|
|
"Sum Sq" 3.47333333333333 |
|
1946
|
|
|
|
|
|
|
} |
|
1947
|
|
|
|
|
|
|
} |
|
1948
|
|
|
|
|
|
|
|
|
1949
|
|
|
|
|
|
|
=head2 p_adjust |
|
1950
|
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
Returns an array of p-values adjusted for multiple comparisons, where the available methods are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" |
|
1952
|
|
|
|
|
|
|
|
|
1953
|
|
|
|
|
|
|
my @q = p_adjust(\@pvalues, $method); |
|
1954
|
|
|
|
|
|
|
|
|
1955
|
|
|
|
|
|
|
=head2 power_t_test |
|
1956
|
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
$test_data = power_t_test( |
|
1958
|
|
|
|
|
|
|
n => 30, delta => 0.5, |
|
1959
|
|
|
|
|
|
|
sd => 1.0, sig_level => 0.05 |
|
1960
|
|
|
|
|
|
|
); |
|
1961
|
|
|
|
|
|
|
|
|
1962
|
|
|
|
|
|
|
It also allows configuring the test type (C<< type => 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative => 'one.sided' >>). You can also pass C<< strict => 1 >> to strictly evaluate both tails of the distribution. |
|
1963
|
|
|
|
|
|
|
|
|
1964
|
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
|
|
1966
|
|
|
|
|
|
|
=begin html |
|
1967
|
|
|
|
|
|
|
|
|
1968
|
|
|
|
|
|
|
|
|
2034
|
|
|
|
|
|
|
|
|
2035
|
|
|
|
|
|
|
=end html |
|
2036
|
|
|
|
|
|
|
|
|
2037
|
|
|
|
|
|
|
|
|
2038
|
|
|
|
|
|
|
|
|
2039
|
|
|
|
|
|
|
=head2 prcomp |
|
2040
|
|
|
|
|
|
|
|
|
2041
|
|
|
|
|
|
|
Principal Component Analysis |
|
2042
|
|
|
|
|
|
|
|
|
2043
|
|
|
|
|
|
|
=head3 Options |
|
2044
|
|
|
|
|
|
|
|
|
2045
|
|
|
|
|
|
|
|
|
2046
|
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
=begin html |
|
2048
|
|
|
|
|
|
|
|
|
2049
|
|
|
|
|
|
|
|
2050
|
|
|
|
|
|
|
|
2051
|
|
|
|
|
|
|
|
|
2052
|
|
|
|
|
|
|
| Option |
|
2053
|
|
|
|
|
|
|
| Type |
|
2054
|
|
|
|
|
|
|
| Default |
|
2055
|
|
|
|
|
|
|
| Description |
|
2056
|
|
|
|
|
|
|
|
|
2057
|
|
|
|
|
|
|
| |
|
2058
|
|
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
|
|
|
2060
|
|
|
|
|
|
|
| center |
|
2061
|
|
|
|
|
|
|
| Boolean |
|
2062
|
|
|
|
|
|
|
| 1 (True) |
|
2063
|
|
|
|
|
|
|
| If true, the variables are shifted to be zero-centered before the analysis takes place. |
|
2064
|
|
|
|
|
|
|
|
|
2065
|
|
|
|
|
|
|
|
|
2066
|
|
|
|
|
|
|
| scale |
|
2067
|
|
|
|
|
|
|
| Boolean |
|
2068
|
|
|
|
|
|
|
| 0 (False) |
|
2069
|
|
|
|
|
|
|
| If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will croak to prevent division by zero. |
|
2070
|
|
|
|
|
|
|
|
|
2071
|
|
|
|
|
|
|
|
|
2072
|
|
|
|
|
|
|
| retx |
|
2073
|
|
|
|
|
|
|
| Boolean |
|
2074
|
|
|
|
|
|
|
| 1 (True) |
|
2075
|
|
|
|
|
|
|
| If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key x. |
|
2076
|
|
|
|
|
|
|
|
|
2077
|
|
|
|
|
|
|
|
|
2078
|
|
|
|
|
|
|
| tol |
|
2079
|
|
|
|
|
|
|
| Number |
|
2080
|
|
|
|
|
|
|
| undef |
|
2081
|
|
|
|
|
|
|
| A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to tol times the standard deviation of the first component. |
|
2082
|
|
|
|
|
|
|
|
|
2083
|
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
| rank |
|
2085
|
|
|
|
|
|
|
| Integer |
|
2086
|
|
|
|
|
|
|
| undef |
|
2087
|
|
|
|
|
|
|
| Optionally specify a strict limit on the number of principal components to return. The function will return min(rank, rows, columns) components. |
|
2088
|
|
|
|
|
|
|
|
|
2089
|
|
|
|
|
|
|
|
|
2090
|
|
|
|
|
|
|
| |
|
2091
|
|
|
|
|
|
|
|
|
2092
|
|
|
|
|
|
|
=end html |
|
2093
|
|
|
|
|
|
|
|
|
2094
|
|
|
|
|
|
|
|
|
2095
|
|
|
|
|
|
|
|
|
2096
|
|
|
|
|
|
|
=head3 Results |
|
2097
|
|
|
|
|
|
|
|
|
2098
|
|
|
|
|
|
|
=head4 Returned Data Structure |
|
2099
|
|
|
|
|
|
|
|
|
2100
|
|
|
|
|
|
|
The C<prcomp> function returns a HashRef containing the following keys representing the results of the Principal Component Analysis: |
|
2101
|
|
|
|
|
|
|
|
|
2102
|
|
|
|
|
|
|
|
|
2103
|
|
|
|
|
|
|
|
|
2104
|
|
|
|
|
|
|
=begin html |
|
2105
|
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
|
2107
|
|
|
|
|
|
|
|
2108
|
|
|
|
|
|
|
|
|
2109
|
|
|
|
|
|
|
| Key |
|
2110
|
|
|
|
|
|
|
| Type |
|
2111
|
|
|
|
|
|
|
| Description |
|
2112
|
|
|
|
|
|
|
|
|
2113
|
|
|
|
|
|
|
| |
|
2114
|
|
|
|
|
|
|
|
|
2115
|
|
|
|
|
|
|
|
|
2116
|
|
|
|
|
|
|
| sdev |
|
2117
|
|
|
|
|
|
|
| ArrayRef[Number] |
|
2118
|
|
|
|
|
|
|
| The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix. |
|
2119
|
|
|
|
|
|
|
|
|
2120
|
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
| rotation |
|
2122
|
|
|
|
|
|
|
| ArrayRef[ArrayRef] |
|
2123
|
|
|
|
|
|
|
| A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components. |
|
2124
|
|
|
|
|
|
|
|
|
2125
|
|
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
|
| x |
|
2127
|
|
|
|
|
|
|
| ArrayRef[ArrayRef] |
|
2128
|
|
|
|
|
|
|
| A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the retx option is true. |
|
2129
|
|
|
|
|
|
|
|
|
2130
|
|
|
|
|
|
|
|
|
2131
|
|
|
|
|
|
|
| center |
|
2132
|
|
|
|
|
|
|
| ArrayRef[Number] or 0 |
|
2133
|
|
|
|
|
|
|
| The centering values used (typically the column means). Returns false (0) if centering was disabled. |
|
2134
|
|
|
|
|
|
|
|
|
2135
|
|
|
|
|
|
|
|
|
2136
|
|
|
|
|
|
|
| scale |
|
2137
|
|
|
|
|
|
|
| ArrayRef[Number] or 0 |
|
2138
|
|
|
|
|
|
|
| The scaling values used (typically the column standard deviations). Returns false (0) if scaling was disabled. |
|
2139
|
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
|
|
2141
|
|
|
|
|
|
|
| varnames |
|
2142
|
|
|
|
|
|
|
| ArrayRef[String] |
|
2143
|
|
|
|
|
|
|
| The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH). |
|
2144
|
|
|
|
|
|
|
|
|
2145
|
|
|
|
|
|
|
|
|
2146
|
|
|
|
|
|
|
| |
|
2147
|
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
=end html |
|
2149
|
|
|
|
|
|
|
|
|
2150
|
|
|
|
|
|
|
|
|
2151
|
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
=head3 Using array of arrays |
|
2153
|
|
|
|
|
|
|
|
|
2154
|
|
|
|
|
|
|
my $aoa = [ |
|
2155
|
|
|
|
|
|
|
[2, 4], |
|
2156
|
|
|
|
|
|
|
[4, 2], |
|
2157
|
|
|
|
|
|
|
[6, 6] |
|
2158
|
|
|
|
|
|
|
]; |
|
2159
|
|
|
|
|
|
|
|
|
2160
|
|
|
|
|
|
|
my $pca = prcomp($aoa); |
|
2161
|
|
|
|
|
|
|
|
|
2162
|
|
|
|
|
|
|
which returns |
|
2163
|
|
|
|
|
|
|
|
|
2164
|
|
|
|
|
|
|
{ |
|
2165
|
|
|
|
|
|
|
center [ |
|
2166
|
|
|
|
|
|
|
[0] 4, |
|
2167
|
|
|
|
|
|
|
[1] 4 |
|
2168
|
|
|
|
|
|
|
], |
|
2169
|
|
|
|
|
|
|
rotation [ |
|
2170
|
|
|
|
|
|
|
[0] [ |
|
2171
|
|
|
|
|
|
|
[0] 0.707106781186547, |
|
2172
|
|
|
|
|
|
|
[1] 0.707106781186548 |
|
2173
|
|
|
|
|
|
|
], |
|
2174
|
|
|
|
|
|
|
[1] [ |
|
2175
|
|
|
|
|
|
|
[0] 0.707106781186548, |
|
2176
|
|
|
|
|
|
|
[1] -0.707106781186547 |
|
2177
|
|
|
|
|
|
|
] |
|
2178
|
|
|
|
|
|
|
], |
|
2179
|
|
|
|
|
|
|
scale 0, |
|
2180
|
|
|
|
|
|
|
sdev [ |
|
2181
|
|
|
|
|
|
|
[0] 2.44948974278318, |
|
2182
|
|
|
|
|
|
|
[1] 1.4142135623731 |
|
2183
|
|
|
|
|
|
|
], |
|
2184
|
|
|
|
|
|
|
x [ |
|
2185
|
|
|
|
|
|
|
[0] [ |
|
2186
|
|
|
|
|
|
|
[0] -1.41421356237309, |
|
2187
|
|
|
|
|
|
|
[1] -1.4142135623731 |
|
2188
|
|
|
|
|
|
|
], |
|
2189
|
|
|
|
|
|
|
[1] [ |
|
2190
|
|
|
|
|
|
|
[0] -1.4142135623731, |
|
2191
|
|
|
|
|
|
|
[1] 1.41421356237309 |
|
2192
|
|
|
|
|
|
|
], |
|
2193
|
|
|
|
|
|
|
[2] [ |
|
2194
|
|
|
|
|
|
|
[0] 2.82842712474619, |
|
2195
|
|
|
|
|
|
|
[1] 2.22044604925031e-16 |
|
2196
|
|
|
|
|
|
|
] |
|
2197
|
|
|
|
|
|
|
] |
|
2198
|
|
|
|
|
|
|
} |
|
2199
|
|
|
|
|
|
|
|
|
2200
|
|
|
|
|
|
|
=head3 Hash of Arrays |
|
2201
|
|
|
|
|
|
|
|
|
2202
|
|
|
|
|
|
|
my $hoa = { B => [4, 2, 6], A => [2, 4, 6] }; |
|
2203
|
|
|
|
|
|
|
my $pca = prcomp($hoa); |
|
2204
|
|
|
|
|
|
|
|
|
2205
|
|
|
|
|
|
|
=head2 quantile |
|
2206
|
|
|
|
|
|
|
|
|
2207
|
|
|
|
|
|
|
Calculates sample quantiles using R's continuous Type 7 interpolation. |
|
2208
|
|
|
|
|
|
|
|
|
2209
|
|
|
|
|
|
|
my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]); |
|
2210
|
|
|
|
|
|
|
|
|
2211
|
|
|
|
|
|
|
If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">). |
|
2212
|
|
|
|
|
|
|
|
|
2213
|
|
|
|
|
|
|
=head2 rbinom |
|
2214
|
|
|
|
|
|
|
|
|
2215
|
|
|
|
|
|
|
Create a binomial distribution of numbers |
|
2216
|
|
|
|
|
|
|
|
|
2217
|
|
|
|
|
|
|
my $binom = rbinom( n => $n, prob => 0.5, size => 9); |
|
2218
|
|
|
|
|
|
|
|
|
2219
|
|
|
|
|
|
|
It hooks directly into Perl's internal PRNG system, respecting C<srand> seeds. |
|
2220
|
|
|
|
|
|
|
|
|
2221
|
|
|
|
|
|
|
=head2 read_table |
|
2222
|
|
|
|
|
|
|
|
|
2223
|
|
|
|
|
|
|
I've tried to make this as simple as possible, trying to follow from R: |
|
2224
|
|
|
|
|
|
|
|
|
2225
|
|
|
|
|
|
|
my $test_data = read_table('t/HepatitisCdata.csv'); |
|
2226
|
|
|
|
|
|
|
|
|
2227
|
|
|
|
|
|
|
=head3 options |
|
2228
|
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
|
|
2230
|
|
|
|
|
|
|
|
|
2231
|
|
|
|
|
|
|
=begin html |
|
2232
|
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
|
|
2274
|
|
|
|
|
|
|
|
|
2275
|
|
|
|
|
|
|
=end html |
|
2276
|
|
|
|
|
|
|
|
|
2277
|
|
|
|
|
|
|
|
|
2278
|
|
|
|
|
|
|
|
|
2279
|
|
|
|
|
|
|
output types can be AOH (aoh), HOA (hoa), HOH (hoh) |
|
2280
|
|
|
|
|
|
|
|
|
2281
|
|
|
|
|
|
|
read_table($filename, 'output.type' => 'aoh'); |
|
2282
|
|
|
|
|
|
|
|
|
2283
|
|
|
|
|
|
|
read_table($filename, 'output.type' => 'hoa'); |
|
2284
|
|
|
|
|
|
|
|
|
2285
|
|
|
|
|
|
|
and, like Text::CSV_XS, filters can be applied in order to save RAM on big files: |
|
2286
|
|
|
|
|
|
|
|
|
2287
|
|
|
|
|
|
|
$test_data = read_table( |
|
2288
|
|
|
|
|
|
|
't/HepatitisCdata.csv', |
|
2289
|
|
|
|
|
|
|
filter => { |
|
2290
|
|
|
|
|
|
|
Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column |
|
2291
|
|
|
|
|
|
|
}, |
|
2292
|
|
|
|
|
|
|
'output.type' => 'aoh' |
|
2293
|
|
|
|
|
|
|
); |
|
2294
|
|
|
|
|
|
|
|
|
2295
|
|
|
|
|
|
|
the default delimiter is C<,> |
|
2296
|
|
|
|
|
|
|
Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority. |
|
2297
|
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
=head2 rnorm |
|
2299
|
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
Make a normal distribution of numbers, with pre-set mean C<mean>, standard deviation C<sd>, and number C<n>. |
|
2301
|
|
|
|
|
|
|
|
|
2302
|
|
|
|
|
|
|
my ($rmean, $sd, $n) = (10, 2, 9999); |
|
2303
|
|
|
|
|
|
|
my $normals = rnorm( n => $n, mean => $rmean, sd => $sd); |
|
2304
|
|
|
|
|
|
|
|
|
2305
|
|
|
|
|
|
|
=head2 runif |
|
2306
|
|
|
|
|
|
|
|
|
2307
|
|
|
|
|
|
|
Make an approximately uniform distribution into an array |
|
2308
|
|
|
|
|
|
|
|
|
2309
|
|
|
|
|
|
|
=head3 named arguments |
|
2310
|
|
|
|
|
|
|
|
|
2311
|
|
|
|
|
|
|
my $unif = runif( n => $n, min => 0, max => 1); |
|
2312
|
|
|
|
|
|
|
|
|
2313
|
|
|
|
|
|
|
where C<n> is the number of items, and the values are between C<min> and C<max> |
|
2314
|
|
|
|
|
|
|
|
|
2315
|
|
|
|
|
|
|
=head3 positional args |
|
2316
|
|
|
|
|
|
|
|
|
2317
|
|
|
|
|
|
|
this is to match R's behavior: |
|
2318
|
|
|
|
|
|
|
|
|
2319
|
|
|
|
|
|
|
runif( 9 ) |
|
2320
|
|
|
|
|
|
|
|
|
2321
|
|
|
|
|
|
|
will make 9 numbers in [0,1] |
|
2322
|
|
|
|
|
|
|
|
|
2323
|
|
|
|
|
|
|
runif(9, 0, 99) |
|
2324
|
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
will match C<n>, C<min>, and C<max> respectively |
|
2326
|
|
|
|
|
|
|
|
|
2327
|
|
|
|
|
|
|
=head2 sample |
|
2328
|
|
|
|
|
|
|
|
|
2329
|
|
|
|
|
|
|
take a sample of hash or array slices. |
|
2330
|
|
|
|
|
|
|
|
|
2331
|
|
|
|
|
|
|
my $h = sample(\%h, 4); # take 4 hash keys and their values into $h |
|
2332
|
|
|
|
|
|
|
|
|
2333
|
|
|
|
|
|
|
or, alternatively, with arrays: |
|
2334
|
|
|
|
|
|
|
|
|
2335
|
|
|
|
|
|
|
my $arr = sample(\@arr, 3); # take 3 indices of an array |
|
2336
|
|
|
|
|
|
|
|
|
2337
|
|
|
|
|
|
|
=head2 scale |
|
2338
|
|
|
|
|
|
|
|
|
2339
|
|
|
|
|
|
|
my @scaled_results = scale(1..5); |
|
2340
|
|
|
|
|
|
|
|
|
2341
|
|
|
|
|
|
|
You can also pass an options hash to disable centering or scaling: |
|
2342
|
|
|
|
|
|
|
|
|
2343
|
|
|
|
|
|
|
my @scaled_results = scale(1..5, { center => 0, scale => 1 }); |
|
2344
|
|
|
|
|
|
|
|
|
2345
|
|
|
|
|
|
|
It fully supports matrix operations. By passing an array of arrays, C<scale> processes the data column by column independently: |
|
2346
|
|
|
|
|
|
|
|
|
2347
|
|
|
|
|
|
|
my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]); |
|
2348
|
|
|
|
|
|
|
|
|
2349
|
|
|
|
|
|
|
=head2 sd |
|
2350
|
|
|
|
|
|
|
|
|
2351
|
|
|
|
|
|
|
my $stdev = sd(2,4,4,4,5,5,7,9); |
|
2352
|
|
|
|
|
|
|
|
|
2353
|
|
|
|
|
|
|
Correct answer is 2.1380899352994 |
|
2354
|
|
|
|
|
|
|
|
|
2355
|
|
|
|
|
|
|
C<sd> can accept both array references and arrays: |
|
2356
|
|
|
|
|
|
|
|
|
2357
|
|
|
|
|
|
|
my $stdev = sd([2,4,4,4,5,5,7,9]); |
|
2358
|
|
|
|
|
|
|
|
|
2359
|
|
|
|
|
|
|
As of version 0.02, sd will croak/die if any undefined values are provided. |
|
2360
|
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
=head2 seq |
|
2362
|
|
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
|
Works as closely as I can to R's seq, which is very similar to Perl's C<for> loops. Returns an array, not an array reference. |
|
2364
|
|
|
|
|
|
|
|
|
2365
|
|
|
|
|
|
|
=head3 Standard integer sequence |
|
2366
|
|
|
|
|
|
|
|
|
2367
|
|
|
|
|
|
|
say 'seq(1, 5):'; |
|
2368
|
|
|
|
|
|
|
my @seq = seq(1, 5); |
|
2369
|
|
|
|
|
|
|
say join(', ', @seq), "\n"; |
|
2370
|
|
|
|
|
|
|
|
|
2371
|
|
|
|
|
|
|
say 'seq(1, 2, 0.25):'; |
|
2372
|
|
|
|
|
|
|
@seq = seq(1, 2, 0.25); |
|
2373
|
|
|
|
|
|
|
|
|
2374
|
|
|
|
|
|
|
=head3 Fractional steps |
|
2375
|
|
|
|
|
|
|
|
|
2376
|
|
|
|
|
|
|
say 'seq(1, 2, 0.25):'; |
|
2377
|
|
|
|
|
|
|
@seq = seq(1, 2, 0.25); |
|
2378
|
|
|
|
|
|
|
say join(", ", @seq), "\n"; |
|
2379
|
|
|
|
|
|
|
for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop |
|
2380
|
|
|
|
|
|
|
is_approx(pop @seq, $idx, "seq item $idx with fractional step"); |
|
2381
|
|
|
|
|
|
|
} |
|
2382
|
|
|
|
|
|
|
|
|
2383
|
|
|
|
|
|
|
=head3 Negative steps |
|
2384
|
|
|
|
|
|
|
|
|
2385
|
|
|
|
|
|
|
say 'seq(10, 5, -1):'; |
|
2386
|
|
|
|
|
|
|
@seq = seq(10, 5, -1); |
|
2387
|
|
|
|
|
|
|
say join(", ", @seq), "\n"; |
|
2388
|
|
|
|
|
|
|
for (my $idx = 5; $idx <= 10; $idx++) { # count up: pop returns the last (smallest) item first |
|
2389
|
|
|
|
|
|
|
is_approx(pop @seq, $idx, "seq item $idx with negative step"); |
|
2390
|
|
|
|
|
|
|
} |
|
2391
|
|
|
|
|
|
|
|
|
2392
|
|
|
|
|
|
|
=head2 shapiro_test |
|
2393
|
|
|
|
|
|
|
|
|
2394
|
|
|
|
|
|
|
tests to see if an array reference is normally distributed, returns a p-value and a statistic |
|
2395
|
|
|
|
|
|
|
|
|
2396
|
|
|
|
|
|
|
my $shapiro = shapiro_test( |
|
2397
|
|
|
|
|
|
|
[1..5] |
|
2398
|
|
|
|
|
|
|
); |
|
2399
|
|
|
|
|
|
|
|
|
2400
|
|
|
|
|
|
|
and returns the hash reference: |
|
2401
|
|
|
|
|
|
|
|
|
2402
|
|
|
|
|
|
|
{ |
|
2403
|
|
|
|
|
|
|
p.value 0.589650577093106, |
|
2404
|
|
|
|
|
|
|
p_value 0.589650577093106, |
|
2405
|
|
|
|
|
|
|
statistic 0.960870680168535, |
|
2406
|
|
|
|
|
|
|
W 0.960870680168535 |
|
2407
|
|
|
|
|
|
|
} |
|
2408
|
|
|
|
|
|
|
|
|
2409
|
|
|
|
|
|
|
=head2 sum |
|
2410
|
|
|
|
|
|
|
|
|
2411
|
|
|
|
|
|
|
returns sum, but using both arrays and array references. |
|
2412
|
|
|
|
|
|
|
|
|
2413
|
|
|
|
|
|
|
my $test_data = [1..8]; |
|
2414
|
|
|
|
|
|
|
sum($test_data) |
|
2415
|
|
|
|
|
|
|
|
|
2416
|
|
|
|
|
|
|
which I prefer, compared to List::Util's required casting into an array: |
|
2417
|
|
|
|
|
|
|
|
|
2418
|
|
|
|
|
|
|
sum(@{ $test_data }); |
|
2419
|
|
|
|
|
|
|
|
|
2420
|
|
|
|
|
|
|
Passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B<both>. |
|
2421
|
|
|
|
|
|
|
|
|
2422
|
|
|
|
|
|
|
as of version 0.02, C<sum> will cause the script to die if any undefined values are provided |
|
2423
|
|
|
|
|
|
|
|
|
2424
|
|
|
|
|
|
|
=head2 summary |
|
2425
|
|
|
|
|
|
|
|
|
2426
|
|
|
|
|
|
|
Analogous to R's C<summary>, but does not deal with outputs from other functions. |
|
2427
|
|
|
|
|
|
|
C<summary> only describes data as it is entered. |
|
2428
|
|
|
|
|
|
|
An option C or its synonym C specifies the maximum number of rows that will print. |
|
2429
|
|
|
|
|
|
|
|
|
2430
|
|
|
|
|
|
|
=head3 array of array input |
|
2431
|
|
|
|
|
|
|
|
|
2432
|
|
|
|
|
|
|
my @arr; |
|
2433
|
|
|
|
|
|
|
foreach my $i (0..18) { |
|
2434
|
|
|
|
|
|
|
push @arr, runif(22); |
|
2435
|
|
|
|
|
|
|
} |
|
2436
|
|
|
|
|
|
|
|
|
2437
|
|
|
|
|
|
|
and then C, or C |
|
2438
|
|
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
--------------------------------------------------------------------------- |
|
2440
|
|
|
|
|
|
|
Index # values Min. 1st Qu. Median Mean 3rd Qu. Max. |
|
2441
|
|
|
|
|
|
|
--------------------------------------------------------------------------- |
|
2442
|
|
|
|
|
|
|
0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633 |
|
2443
|
|
|
|
|
|
|
1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371 |
|
2444
|
|
|
|
|
|
|
2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878 |
|
2445
|
|
|
|
|
|
|
3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808 |
|
2446
|
|
|
|
|
|
|
4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043 |
|
2447
|
|
|
|
|
|
|
5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556 |
|
2448
|
|
|
|
|
|
|
|
|
2449
|
|
|
|
|
|
|
=head3 hash of array input |
|
2450
|
|
|
|
|
|
|
|
|
2451
|
|
|
|
|
|
|
$test_data = summary( |
|
2452
|
|
|
|
|
|
|
{ |
|
2453
|
|
|
|
|
|
|
A => runif(9), |
|
2454
|
|
|
|
|
|
|
B => runif(9) |
|
2455
|
|
|
|
|
|
|
}, |
|
2456
|
|
|
|
|
|
|
); |
|
2457
|
|
|
|
|
|
|
|
|
2458
|
|
|
|
|
|
|
=head2 t_test |
|
2459
|
|
|
|
|
|
|
|
|
2460
|
|
|
|
|
|
|
There are 1-sample and 2-sample t-tests, from one or two arrays: |
|
2461
|
|
|
|
|
|
|
|
|
2462
|
|
|
|
|
|
|
my $t_test = t_test( $array1, mu => 0.2334 ); |
|
2463
|
|
|
|
|
|
|
|
|
2464
|
|
|
|
|
|
|
or 2-sample: |
|
2465
|
|
|
|
|
|
|
|
|
2466
|
|
|
|
|
|
|
$t_test = t_test( |
|
2467
|
|
|
|
|
|
|
$array1, $array2, |
|
2468
|
|
|
|
|
|
|
paired => 1 |
|
2469
|
|
|
|
|
|
|
); |
|
2470
|
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
returns a hash reference, which looks like: |
|
2472
|
|
|
|
|
|
|
|
|
2473
|
|
|
|
|
|
|
conf_int => [ |
|
2474
|
|
|
|
|
|
|
-0.06672889, 0.25672889 |
|
2475
|
|
|
|
|
|
|
], |
|
2476
|
|
|
|
|
|
|
df => 5, |
|
2477
|
|
|
|
|
|
|
estimate => 0.095, |
|
2478
|
|
|
|
|
|
|
p_value => 0.19143688433660, |
|
2479
|
|
|
|
|
|
|
statistic => 1.50996688705414 |
|
2480
|
|
|
|
|
|
|
|
|
2481
|
|
|
|
|
|
|
the two groups compared can be specified, though not necessarily, as C<x> and C<y>, just like in R: |
|
2482
|
|
|
|
|
|
|
|
|
2483
|
|
|
|
|
|
|
$t_test = t_test( |
|
2484
|
|
|
|
|
|
|
'x' => $array1, 'y' => $array2, |
|
2485
|
|
|
|
|
|
|
paired => 1 |
|
2486
|
|
|
|
|
|
|
); |
|
2487
|
|
|
|
|
|
|
|
|
2488
|
|
|
|
|
|
|
=head3 Parameters |
|
2489
|
|
|
|
|
|
|
|
|
2490
|
|
|
|
|
|
|
|
|
2491
|
|
|
|
|
|
|
|
|
2492
|
|
|
|
|
|
|
=begin html |
|
2493
|
|
|
|
|
|
|
|
|
2494
|
|
|
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
|
|
2549
|
|
|
|
|
|
|
=end html |
|
2550
|
|
|
|
|
|
|
|
|
2551
|
|
|
|
|
|
|
|
|
2552
|
|
|
|
|
|
|
|
|
2553
|
|
|
|
|
|
|
=head3 Return Hash |
|
2554
|
|
|
|
|
|
|
|
|
2555
|
|
|
|
|
|
|
|
|
2556
|
|
|
|
|
|
|
|
|
2557
|
|
|
|
|
|
|
=begin html |
|
2558
|
|
|
|
|
|
|
|
|
2559
|
|
|
|
|
|
|
|
|
2597
|
|
|
|
|
|
|
|
|
2598
|
|
|
|
|
|
|
=end html |
|
2599
|
|
|
|
|
|
|
|
|
2600
|
|
|
|
|
|
|
|
|
2601
|
|
|
|
|
|
|
|
|
2602
|
|
|
|
|
|
|
=head2 transpose |
|
2603
|
|
|
|
|
|
|
|
|
2604
|
|
|
|
|
|
|
Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes. |
|
2605
|
|
|
|
|
|
|
Returns a new reference of the same type; the input is never modified. |
|
2606
|
|
|
|
|
|
|
|
|
2607
|
|
|
|
|
|
|
=head3 Array of array input |
|
2608
|
|
|
|
|
|
|
|
|
2609
|
|
|
|
|
|
|
Takes a reference to an array of array references and returns a new AoA where C |
|
2610
|
|
|
|
|
|
|
|
|
2611
|
|
|
|
|
|
|
my $matrix = [[1, 2, 3], [4, 5, 6]]; |
|
2612
|
|
|
|
|
|
|
my $t = transpose($matrix); |
|
2613
|
|
|
|
|
|
|
# [[1, 4], |
|
2614
|
|
|
|
|
|
|
# [2, 5], |
|
2615
|
|
|
|
|
|
|
# [3, 6]] |
|
2616
|
|
|
|
|
|
|
|
|
2617
|
|
|
|
|
|
|
All rows must be the same length; a ragged input is a fatal error. |
|
2618
|
|
|
|
|
|
|
C<undef> is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>. |
|
2619
|
|
|
|
|
|
|
|
|
2620
|
|
|
|
|
|
|
Dies if: |
|
2621
|
|
|
|
|
|
|
- any inner element is not an array reference |
|
2622
|
|
|
|
|
|
|
- rows differ in length (ragged array) |
|
2623
|
|
|
|
|
|
|
|
|
2624
|
|
|
|
|
|
|
=head3 Hash of hash input |
|
2625
|
|
|
|
|
|
|
|
|
2626
|
|
|
|
|
|
|
Takes a reference to a hash of hash references and returns a new HoH where C |
|
2627
|
|
|
|
|
|
|
|
|
2628
|
|
|
|
|
|
|
my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } }; |
|
2629
|
|
|
|
|
|
|
my $t = transpose($table); |
|
2630
|
|
|
|
|
|
|
# { score => { alice => 97, bob => 84 }, |
|
2631
|
|
|
|
|
|
|
# grade => { alice => 'A', bob => 'B' } } |
|
2632
|
|
|
|
|
|
|
|
|
2633
|
|
|
|
|
|
|
Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C<undef>-filling is performed. |
|
2634
|
|
|
|
|
|
|
|
|
2635
|
|
|
|
|
|
|
my $sparse = { |
|
2636
|
|
|
|
|
|
|
a => { x => 1, y => 2 }, |
|
2637
|
|
|
|
|
|
|
b => { x => 3, z => 4 } }; |
|
2638
|
|
|
|
|
|
|
|
|
2639
|
|
|
|
|
|
|
my $t = transpose($sparse); |
|
2640
|
|
|
|
|
|
|
# { x => { a => 1, b => 3 }, |
|
2641
|
|
|
|
|
|
|
# y => { a => 2 }, |
|
2642
|
|
|
|
|
|
|
# z => { b => 4 } } |
|
2643
|
|
|
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>. |
|
2645
|
|
|
|
|
|
|
|
|
2646
|
|
|
|
|
|
|
Dies if any inner element is not a hash reference |
|
2647
|
|
|
|
|
|
|
|
|
2648
|
|
|
|
|
|
|
=head2 value_counts |
|
2649
|
|
|
|
|
|
|
|
|
2650
|
|
|
|
|
|
|
Count the values in a given data set, return a hash reference showing how many times each particular value is present. |
|
2651
|
|
|
|
|
|
|
|
|
2652
|
|
|
|
|
|
|
=head3 Scalar |
|
2653
|
|
|
|
|
|
|
|
|
2654
|
|
|
|
|
|
|
$hash = value_counts('c'); |
|
2655
|
|
|
|
|
|
|
|
|
2656
|
|
|
|
|
|
|
returns C<< { c => 1 } >> |
|
2657
|
|
|
|
|
|
|
|
|
2658
|
|
|
|
|
|
|
=head3 Array reference |
|
2659
|
|
|
|
|
|
|
|
|
2660
|
|
|
|
|
|
|
value_counts(['a','b','b']); |
|
2661
|
|
|
|
|
|
|
|
|
2662
|
|
|
|
|
|
|
returns C<< { a => 1, b => 2 } >> |
|
2663
|
|
|
|
|
|
|
|
|
2664
|
|
|
|
|
|
|
=head3 Array |
|
2665
|
|
|
|
|
|
|
|
|
2666
|
|
|
|
|
|
|
my $value_counts = value_counts('a','b','b'); |
|
2667
|
|
|
|
|
|
|
|
|
2668
|
|
|
|
|
|
|
like an array reference above, returns C<< { a => 1, b => 2 } >> |
|
2669
|
|
|
|
|
|
|
|
|
2670
|
|
|
|
|
|
|
=head3 Hash |
|
2671
|
|
|
|
|
|
|
|
|
2672
|
|
|
|
|
|
|
my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } ); |
|
2673
|
|
|
|
|
|
|
|
|
2674
|
|
|
|
|
|
|
returns C<< { a => 2, b => 1 } >> |
|
2675
|
|
|
|
|
|
|
|
|
2676
|
|
|
|
|
|
|
=head3 Hash of array |
|
2677
|
|
|
|
|
|
|
|
|
2678
|
|
|
|
|
|
|
my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}); |
|
2679
|
|
|
|
|
|
|
|
|
2680
|
|
|
|
|
|
|
without a key (like above), the occurrences of C<j>, C<t>, and C<v> are counted across all keys. |
|
2681
|
|
|
|
|
|
|
|
|
2682
|
|
|
|
|
|
|
With a key, like C<a> for above, only values within that hash key are counted: |
|
2683
|
|
|
|
|
|
|
|
|
2684
|
|
|
|
|
|
|
my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a'); |
|
2685
|
|
|
|
|
|
|
|
|
2686
|
|
|
|
|
|
|
=head3 Hash of hash (table) |
|
2687
|
|
|
|
|
|
|
|
|
2688
|
|
|
|
|
|
|
$hash = value_counts( { |
|
2689
|
|
|
|
|
|
|
A => { |
|
2690
|
|
|
|
|
|
|
a => 'x', |
|
2691
|
|
|
|
|
|
|
b => 'z' |
|
2692
|
|
|
|
|
|
|
}, |
|
2693
|
|
|
|
|
|
|
B => { |
|
2694
|
|
|
|
|
|
|
a => 'x' |
|
2695
|
|
|
|
|
|
|
}, |
|
2696
|
|
|
|
|
|
|
C => { |
|
2697
|
|
|
|
|
|
|
a => 'y' |
|
2698
|
|
|
|
|
|
|
} |
|
2699
|
|
|
|
|
|
|
}, 'a'); |
|
2700
|
|
|
|
|
|
|
|
|
2701
|
|
|
|
|
|
|
the column, or second hash key, that you wish to count is specified as the second argument |
|
2702
|
|
|
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
=head2 var |
|
2704
|
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
as simple as possible: |
|
2706
|
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
var(2, 4, 5, 8, 9) |
|
2708
|
|
|
|
|
|
|
|
|
2709
|
|
|
|
|
|
|
as of version 0.02, C<var> will die if any undefined values are provided |
|
2710
|
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
like C, C, etc., C can accept array references, to make code simpler: |
|
2712
|
|
|
|
|
|
|
|
|
2713
|
|
|
|
|
|
|
my $ref = \@arr; |
|
2714
|
|
|
|
|
|
|
var($ref) = var(@arr) |
|
2715
|
|
|
|
|
|
|
|
|
2716
|
|
|
|
|
|
|
=head2 var_test |
|
2717
|
|
|
|
|
|
|
|
|
2718
|
|
|
|
|
|
|
As described by R: Performs an F test to compare the variances of two samples from normal populations |
|
2719
|
|
|
|
|
|
|
|
|
2720
|
|
|
|
|
|
|
use Stats::LikeR; |
|
2721
|
|
|
|
|
|
|
|
|
2722
|
|
|
|
|
|
|
my @x = (2.9, 3.0, 2.5, 2.6, 3.2); |
|
2723
|
|
|
|
|
|
|
my @y = (3.8, 2.7, 4.0, 2.4); |
|
2724
|
|
|
|
|
|
|
|
|
2725
|
|
|
|
|
|
|
my $vt = var_test(\@x, \@y); |
|
2726
|
|
|
|
|
|
|
|
|
2727
|
|
|
|
|
|
|
also, conf_level can be set: |
|
2728
|
|
|
|
|
|
|
|
|
2729
|
|
|
|
|
|
|
$vt = var_test(\@x, \@y, conf_level => 0.99); |
|
2730
|
|
|
|
|
|
|
|
|
2731
|
|
|
|
|
|
|
as well as a ratio (from R: the hypothesized ratio of the population variances of C<x> and C<y>): |
|
2732
|
|
|
|
|
|
|
|
|
2733
|
|
|
|
|
|
|
$test_data = var_test(\@xk, \@yk, ratio => 2); |
|
2734
|
|
|
|
|
|
|
|
|
2735
|
|
|
|
|
|
|
=head2 wilcox_test |
|
2736
|
|
|
|
|
|
|
|
|
2737
|
|
|
|
|
|
|
$test_data = wilcox_test( |
|
2738
|
|
|
|
|
|
|
[1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30], |
|
2739
|
|
|
|
|
|
|
[0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29] |
|
2740
|
|
|
|
|
|
|
); |
|
2741
|
|
|
|
|
|
|
|
|
2742
|
|
|
|
|
|
|
It fully supports paired tests (C<< paired => 1 >>) and can calculate exact p-values (the default for C<< N < 50 >> without ties). If ties are encountered, it automatically switches to an approximation with continuity correction. |
|
2743
|
|
|
|
|
|
|
|
|
2744
|
|
|
|
|
|
|
=head2 write_table |
|
2745
|
|
|
|
|
|
|
|
|
2746
|
|
|
|
|
|
|
mimics R's C<write.table>, with data as first argument to subroutine, and output file as second |
|
2747
|
|
|
|
|
|
|
|
|
2748
|
|
|
|
|
|
|
write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1); |
|
2749
|
|
|
|
|
|
|
|
|
2750
|
|
|
|
|
|
|
You can also precisely filter and reorder which columns are written by passing an array reference to C<col.names>: |
|
2751
|
|
|
|
|
|
|
|
|
2752
|
|
|
|
|
|
|
write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']); |
|
2753
|
|
|
|
|
|
|
|
|
2754
|
|
|
|
|
|
|
undefined variables are printed as C by default, but can be set as you wish using C |
|
2755
|
|
|
|
|
|
|
|
|
2756
|
|
|
|
|
|
|
write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan') |
|
2757
|
|
|
|
|
|
|
|
|
2758
|
|
|
|
|
|
|
as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set. |
|
2759
|
|
|
|
|
|
|
|
|
2760
|
|
|
|
|
|
|
Named arguments can also be accepted: |
|
2761
|
|
|
|
|
|
|
|
|
2762
|
|
|
|
|
|
|
write_table( 'data' => \%flat, 'file' => $f ); |
|
2763
|
|
|
|
|
|
|
|
|
2764
|
|
|
|
|
|
|
=head1 changes |
|
2765
|
|
|
|
|
|
|
|
|
2766
|
|
|
|
|
|
|
=head2 0.14 |
|
2767
|
|
|
|
|
|
|
|
|
2768
|
|
|
|
|
|
|
C function added for rows |
|
2769
|
|
|
|
|
|
|
|
|
2770
|
|
|
|
|
|
|
C reads undefined values to C instead of C, which makes calculations easier |
|
2771
|
|
|
|
|
|
|
|
|
2772
|
|
|
|
|
|
|
C writes undef by default as an empty string C<''> |
|
2773
|
|
|
|
|
|
|
|
|
2774
|
|
|
|
|
|
|
C<hoh2hoa> transforms a hash of hashes into a hash of arrays |
|
2775
|
|
|
|
|
|
|
|
|
2776
|
|
|
|
|
|
|
C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b |
|
2777
|
|
|
|
|
|
|
|
|
2778
|
|
|
|
|
|
|
Numerous switches from C to C for local precision, like above |
|
2779
|
|
|
|
|
|
|
|
|
2780
|
|
|
|
|
|
|
numerous changes to C for ease of use and working with datasets with numerous undefined values |
|
2781
|
|
|
|
|
|
|
|
|
2782
|
|
|
|
|
|
|
dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea |
|
2783
|
|
|
|
|
|
|
|
|
2784
|
|
|
|
|
|
|
C now should be complete, errors with confidence intervals fixed |
|
2785
|
|
|
|
|
|
|
|
|
2786
|
|
|
|
|
|
|
=head2 0.13 |
|
2787
|
|
|
|
|
|
|
|
|
2788
|
|
|
|
|
|
|
C: speed improvements; commented headers are now allowed |
|
2789
|
|
|
|
|
|
|
|
|
2790
|
|
|
|
|
|
|
C: fix for |
|
2791
|
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182. |
|
2793
|
|
|
|
|
|
|
main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203 |
|
2794
|
|
|
|
|
|
|
Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183. |
|
2795
|
|
|
|
|
|
|
main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203 |
|
2796
|
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
C gives better warnings for incorrect types of data given |
|
2798
|
|
|
|
|
|
|
|
|
2799
|
|
|
|
|
|
|
Numerous changes to dist.ini to improve CPAN testing, especially for Win32 |
|
2800
|
|
|
|
|
|
|
|
|
2801
|
|
|
|
|
|
|
=head2 0.12 |
|
2802
|
|
|
|
|
|
|
|
|
2803
|
|
|
|
|
|
|
C can also take hash of arrays, and various mixes of data types |
|
2804
|
|
|
|
|
|
|
|
|
2805
|
|
|
|
|
|
|
C: Addition of C keywords in many places; should improve CPU performance |
|
2806
|
|
|
|
|
|
|
|
|
2807
|
|
|
|
|
|
|
Better POD formatting, correction of output hash for README's C |
|
2808
|
|
|
|
|
|
|
|
|
2809
|
|
|
|
|
|
|
C can now accept hash of hashes as input |
|
2810
|
|
|
|
|
|
|
|
|
2811
|
|
|
|
|
|
|
new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns |
|
2812
|
|
|
|
|
|
|
|
|
2813
|
|
|
|
|
|
|
removed unused function from C helpers |
|
2814
|
|
|
|
|
|
|
|
|
2815
|
|
|
|
|
|
|
C: addition of restrict keywords in preinit, should improve CPU performance |
|
2816
|
|
|
|
|
|
|
|
|
2817
|
|
|
|
|
|
|
MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing |
|
2818
|
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
using C for tests of C, which may or may not work with CPAN testers (experimental) |
|
2820
|
|
|
|
|
|
|
|
|
2821
|
|
|
|
|
|
|
Added function name to warnings, so I actually know which function is producing the error |
|
2822
|
|
|
|
|
|
|
|
|
2823
|
|
|
|
|
|
|
C can also take C and C as args, in addition to positions |
|
2824
|
|
|
|
|
|
|
|
|
2825
|
|
|
|
|
|
|
fixed C as it could hang if given empty C or C |
|
2826
|
|
|
|
|
|
|
|
|
2827
|
|
|
|
|
|
|
Added C<__EXTENSIONS__> to source XS file for better CPAN testing |
|
2828
|
|
|
|
|
|
|
|
|
2829
|
|
|
|
|
|
|
=head2 0.11 |
|
2830
|
|
|
|
|
|
|
|
|
2831
|
|
|
|
|
|
|
better POD formatting for tables |
|
2832
|
|
|
|
|
|
|
|
|
2833
|
|
|
|
|
|
|
addition of MANIFEST.skip to get better testing results on CPAN |
|
2834
|
|
|
|
|
|
|
|
|
2835
|
|
|
|
|
|
|
C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t |
|
2836
|
|
|
|
|
|
|
|
|
2837
|
|
|
|
|
|
|
C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes |
|
2838
|
|
|
|
|
|
|
|
|
2839
|
|
|
|
|
|
|
Better documentation for t-test |
|
2840
|
|
|
|
|
|
|
|
|
2841
|
|
|
|
|
|
|
=head2 0.10 |
|
2842
|
|
|
|
|
|
|
|
|
2843
|
|
|
|
|
|
|
changes to compilation for CPAN, trying to get this to work on Windows |
|
2844
|
|
|
|
|
|
|
|
|
2845
|
|
|
|
|
|
|
Addition of C and C |
|
2846
|
|
|
|
|
|
|
|
|
2847
|
|
|
|
|
|
|
C will work without key names, just like in R. Testing for C has improved. |
|
2848
|
|
|
|
|
|
|
|
|
2849
|
|
|
|
|
|
|
=head2 0.09 |
|
2850
|
|
|
|
|
|
|
|
|
2851
|
|
|
|
|
|
|
context changes in XS C, C, and C to get better CPAN testing results |
|
2852
|
|
|
|
|
|
|
|
|
2853
|
|
|
|
|
|
|
C keywords added to C to increase speed |
|
2854
|
|
|
|
|
|
|
|
|
2855
|
|
|
|
|
|
|
=head2 0.08 |
|
2856
|
|
|
|
|
|
|
|
|
2857
|
|
|
|
|
|
|
Speed improvement in C of hashes. |
|
2858
|
|
|
|
|
|
|
|
|
2859
|
|
|
|
|
|
|
Addition of C, C, C, C, and C functions |
|
2860
|
|
|
|
|
|
|
|
|
2861
|
|
|
|
|
|
|
Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call. |
|
2862
|
|
|
|
|
|
|
|
|
2863
|
|
|
|
|
|
|
Compiler changes for GNU source and inclusion of C, to make CPAN testing work better. |
|
2864
|
|
|
|
|
|
|
|
|
2865
|
|
|
|
|
|
|
C now returns hash-of-hash in {row}{column} |
|
2866
|
|
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
|
=head2 0.07 |
|
2868
|
|
|
|
|
|
|
|
|
2869
|
|
|
|
|
|
|
Addition of C function. |
|
2870
|
|
|
|
|
|
|
|
|
2871
|
|
|
|
|
|
|
Formulas can now be omitted from C, resulting in a stacked calculation, the same way R would handle it. |
|
2872
|
|
|
|
|
|
|
|
|
2873
|
|
|
|
|
|
|
Addition of C for multi-group comparisons that does not assume normality like C does. |
|
2874
|
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
C<read_table> and C<write_table> now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name. |
|
2876
|
|
|
|
|
|
|
|
|
2877
|
|
|
|
|
|
|
=head2 0.06 |
|
2878
|
|
|
|
|
|
|
|
|
2879
|
|
|
|
|
|
|
Changed compiler options so that Solaris will work |
|
2880
|
|
|
|
|
|
|
|
|
2881
|
|
|
|
|
|
|
signed integers changed to unsigned in C |
|
2882
|
|
|
|
|
|
|
|
|
2883
|
|
|
|
|
|
|
Added restrict keywords to C, and made C to C |
|
2884
|
|
|
|
|
|
|
|
|
2885
|
|
|
|
|
|
|
=head2 0.05 |
|
2886
|
|
|
|
|
|
|
|
|
2887
|
|
|
|
|
|
|
Leak testing for C |
|
2888
|
|
|
|
|
|
|
|
|
2889
|
|
|
|
|
|
|
removal of Data::Printer dependency for easier CPAN testing |
|
2890
|
|
|
|
|
|
|
|
|
2891
|
|
|
|
|
|
|
switched several C variables to C so that clang doesn't complain |
|
2892
|
|
|
|
|
|
|
|
|
2893
|
|
|
|
|
|
|
added restrict keyword for C |
|
2894
|
|
|
|
|
|
|
|
|
2895
|
|
|
|
|
|
|
=head2 0.04 |
|
2896
|
|
|
|
|
|
|
|
|
2897
|
|
|
|
|
|
|
addition of C function |
|
2898
|
|
|
|
|
|
|
|
|
2899
|
|
|
|
|
|
|
GNU source, to maximize compatibility and ease installation |
|
2900
|
|
|
|
|
|
|
|
|
2901
|
|
|
|
|
|
|
removal of JSON dependency to ease installation |
|
2902
|
|
|
|
|
|
|
|
|
2903
|
|
|
|
|
|
|
=head2 0.03 |
|
2904
|
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
Compatibility back to Perl 5.10 |
|
2906
|
|
|
|
|
|
|
|
|
2907
|
|
|
|
|
|
|
=head2 0.02 |
|
2908
|
|
|
|
|
|
|
|
|
2909
|
|
|
|
|
|
|
backward-compatible to Perl 5.10, instead of the original 5.40, ensuring more people can use it |
|
2910
|
|
|
|
|
|
|
|
|
2911
|
|
|
|
|
|
|
added var_test |
|
2912
|
|
|
|
|
|
|
|
|
2913
|
|
|
|
|
|
|
mean, min, sum, median, var, and max die with undefined values, and print the offending indices |
|
2914
|
|
|
|
|
|
|
|
|
2915
|
|
|
|
|
|
|
"group_stats" added to aov, for TukeyHSD in the future |
|
2916
|
|
|
|
|
|
|
|
|
2917
|
|
|
|
|
|
|
"cor" dies when given data with standard deviation of 0 |
|
2918
|
|
|
|
|
|
|
|
|
2919
|
|
|
|
|
|
|
C now has a C option, which controls how undefined values are printed to tables; it is C by default. |