| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
|
2
|
|
|
|
|
|
|
# ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance |
|
3
|
|
|
|
|
|
|
require 5.010; |
|
4
|
23
|
|
|
23
|
|
1201268
|
use strict; |
|
|
23
|
|
|
|
|
37
|
|
|
|
23
|
|
|
|
|
810
|
|
|
5
|
23
|
|
|
23
|
|
151
|
use feature 'say'; |
|
|
23
|
|
|
|
|
37
|
|
|
|
23
|
|
|
|
|
6359
|
|
|
6
|
|
|
|
|
|
|
package Stats::LikeR; |
|
7
|
|
|
|
|
|
|
our $VERSION = 0.16; |
|
8
|
|
|
|
|
|
|
require XSLoader; |
|
9
|
23
|
|
|
23
|
|
14427
|
use Devel::Confess 'color'; |
|
|
23
|
|
|
|
|
224576
|
|
|
|
23
|
|
|
|
|
87
|
|
|
10
|
23
|
|
|
23
|
|
2358
|
use warnings FATAL => 'all'; |
|
|
23
|
|
|
|
|
60
|
|
|
|
23
|
|
|
|
|
1067
|
|
|
11
|
23
|
|
|
23
|
|
9284
|
use autodie ':default'; |
|
|
23
|
|
|
|
|
341909
|
|
|
|
23
|
|
|
|
|
80
|
|
|
12
|
23
|
|
|
23
|
|
110512
|
use Exporter 'import'; |
|
|
23
|
|
|
|
|
38
|
|
|
|
23
|
|
|
|
|
891
|
|
|
13
|
23
|
|
|
23
|
|
106
|
use Scalar::Util 'looks_like_number'; |
|
|
23
|
|
|
|
|
32
|
|
|
|
23
|
|
|
|
|
14366
|
|
|
14
|
|
|
|
|
|
|
XSLoader::load('Stats::LikeR', $VERSION); |
|
15
|
|
|
|
|
|
|
our @EXPORT_OK = qw(add_data aoh2hoa aov cfilter chisq_test col col2col cor cor_test cov csort dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test view wilcox_test write_table); |
|
16
|
|
|
|
|
|
|
our @EXPORT = @EXPORT_OK; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
require XSLoader; |
|
19
|
|
|
|
|
|
|
# ---- filter DSL: col() builds a predicate via overloading (pure Perl) ------- |
|
20
|
|
|
|
|
|
|
# Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm; |
|
21
|
|
|
|
|
|
|
# they need no XS. filter() is the XSUB. |
|
22
|
50
|
|
|
50
|
1
|
274588
|
sub col { Stats::LikeR::Col->new($_[0]) } |
|
23
|
|
|
|
|
|
|
{ |
|
24
|
|
|
|
|
|
|
package Stats::LikeR::Col; |
|
25
|
50
|
|
33
|
50
|
|
351
|
sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] } |
|
26
|
|
|
|
|
|
|
# build a comparison leaf; if operands were swapped (4 > col('x')), flip the op |
|
27
|
|
|
|
|
|
|
sub _c { |
|
28
|
50
|
|
|
50
|
|
98
|
my ($self, $val, $swapped, $op, $flip) = @_; |
|
29
|
50
|
100
|
|
|
|
127
|
Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val); |
|
30
|
|
|
|
|
|
|
} |
|
31
|
|
|
|
|
|
|
use overload |
|
32
|
19
|
|
|
19
|
|
42
|
'>' => sub { $_[0]->_c($_[1],$_[2],'>','<') }, |
|
33
|
7
|
|
|
7
|
|
23
|
'<' => sub { $_[0]->_c($_[1],$_[2],'<','>') }, |
|
34
|
2
|
|
|
2
|
|
7
|
'>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') }, |
|
35
|
3
|
|
|
3
|
|
9
|
'<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') }, |
|
36
|
4
|
|
|
4
|
|
11
|
'==' => sub { $_[0]->_c($_[1],$_[2],'==','==') }, |
|
37
|
2
|
|
|
2
|
|
6
|
'!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') }, |
|
38
|
2
|
|
|
2
|
|
6
|
'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') }, |
|
39
|
2
|
|
|
2
|
|
6
|
'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') }, |
|
40
|
2
|
|
|
2
|
|
6
|
'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') }, |
|
41
|
2
|
|
|
2
|
|
5
|
'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') }, |
|
42
|
3
|
|
|
3
|
|
10
|
'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') }, |
|
43
|
2
|
|
|
2
|
|
7
|
'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') }, |
|
44
|
23
|
|
|
23
|
|
206
|
fallback => 1; |
|
|
23
|
|
|
|
|
43
|
|
|
|
23
|
|
|
|
|
489
|
|
|
45
|
|
|
|
|
|
|
} |
|
46
|
|
|
|
|
|
|
{ |
|
47
|
|
|
|
|
|
|
package Stats::LikeR::Pred; |
|
48
|
50
|
|
|
50
|
|
504
|
sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' } |
|
49
|
8
|
|
|
8
|
|
68
|
sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' } |
|
50
|
|
|
|
|
|
|
use overload |
|
51
|
4
|
|
|
4
|
|
11
|
'&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) }, |
|
52
|
2
|
|
|
2
|
|
6
|
'|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) }, |
|
53
|
2
|
|
|
2
|
|
6
|
'!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) }, |
|
54
|
23
|
|
|
23
|
|
7861
|
fallback => 1; |
|
|
23
|
|
|
|
|
31
|
|
|
|
23
|
|
|
|
|
157
|
|
|
55
|
|
|
|
|
|
|
} |
|
56
|
|
|
|
|
|
|
sub summary { |
|
57
|
15
|
|
|
15
|
1
|
799852
|
my ($data, %args); |
|
58
|
15
|
|
|
|
|
128
|
my $current_sub = (split(/::/,(caller(0))[3]))[-1]; |
|
59
|
|
|
|
|
|
|
|
|
60
|
15
|
100
|
66
|
|
|
99
|
if (@_ && ref $_[0]) { |
|
61
|
|
|
|
|
|
|
# Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3) |
|
62
|
10
|
|
|
|
|
16
|
$data = shift; |
|
63
|
10
|
|
|
|
|
22
|
%args = @_; # capture any trailing key/value pairs |
|
64
|
|
|
|
|
|
|
} else { |
|
65
|
|
|
|
|
|
|
# Handles: summary(@runif) or summary(@runif, nrows => 2) |
|
66
|
|
|
|
|
|
|
# Extract known trailing named arguments from the flat list |
|
67
|
5
|
|
33
|
|
|
51
|
while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) { |
|
|
|
|
33
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
68
|
2
|
|
|
|
|
3
|
my $val = pop @_; |
|
69
|
2
|
|
|
|
|
3
|
my $key = pop @_; |
|
70
|
2
|
|
|
|
|
11
|
$args{$key} = $val; |
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
# The remaining items in @_ make up the actual data array |
|
73
|
5
|
|
|
|
|
17
|
my @list = @_; |
|
74
|
5
|
|
|
|
|
14
|
$data = \@list; |
|
75
|
|
|
|
|
|
|
} |
|
76
|
|
|
|
|
|
|
# Normalize nrow -> nrows, default to 10 |
|
77
|
15
|
|
100
|
|
|
105
|
$args{nrows} //= delete($args{nrow}) // 10; |
|
|
|
|
66
|
|
|
|
|
|
78
|
15
|
|
|
|
|
28
|
my $ref_type = ref $data; |
|
79
|
15
|
100
|
100
|
|
|
72
|
if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) { |
|
80
|
1
|
|
|
|
|
14
|
die "$current_sub' data must either be a hash or an array, not \"$ref_type\""; |
|
81
|
|
|
|
|
|
|
} |
|
82
|
14
|
|
|
|
|
21
|
my $single_arr = 0; |
|
83
|
14
|
100
|
100
|
|
|
53
|
if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) { |
|
84
|
8
|
|
|
|
|
15
|
$single_arr = 1; |
|
85
|
|
|
|
|
|
|
} |
|
86
|
14
|
|
|
|
|
37
|
my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.'); |
|
87
|
14
|
|
|
|
|
19
|
my @out; |
|
88
|
14
|
100
|
|
|
|
40
|
if ($single_arr == 1) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
89
|
8
|
|
|
|
|
14
|
push @out, '-' x 75; |
|
90
|
8
|
|
|
|
|
42
|
my $header = sprintf('%9s ' x scalar @header, @header); |
|
91
|
8
|
|
|
|
|
14
|
push @out, $header; |
|
92
|
8
|
|
|
|
|
12
|
push @out, '-' x 75; |
|
93
|
8
|
|
|
|
|
12
|
my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1; |
|
|
219
|
|
|
|
|
402
|
|
|
|
8
|
|
|
|
|
30
|
|
|
94
|
8
|
100
|
|
|
|
45
|
if (scalar @undef > 0) { |
|
95
|
1
|
|
|
|
|
27
|
say STDERR join (',', @undef); |
|
96
|
1
|
|
|
|
|
13
|
die "The above indices are not defined in $current_sub"; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
7
|
|
|
|
|
10
|
my @numeric = grep {looks_like_number($_)} @{ $data }; |
|
|
216
|
|
|
|
|
334
|
|
|
|
7
|
|
|
|
|
28
|
|
|
99
|
7
|
|
|
|
|
158
|
my $q = quantile(\@numeric, probs => [0.25, 0.75]); |
|
100
|
7
|
|
|
|
|
202
|
my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric)); |
|
101
|
7
|
|
|
|
|
30
|
push @out, $vals; |
|
102
|
|
|
|
|
|
|
} elsif ($ref_type eq 'ARRAY') { |
|
103
|
3
|
|
|
|
|
7
|
push @out, '-' x 75; |
|
104
|
3
|
|
|
|
|
19
|
my $header = sprintf('%9s ' x scalar @header, @header); |
|
105
|
3
|
|
|
|
|
7
|
unshift @header, 'Index'; |
|
106
|
3
|
|
|
|
|
7
|
$header = 'Index ' . $header; |
|
107
|
3
|
|
|
|
|
5
|
push @out, $header; |
|
108
|
3
|
|
|
|
|
5
|
push @out, '-' x 75; |
|
109
|
3
|
|
|
|
|
5
|
my $rows_printed = 0; |
|
110
|
3
|
|
|
|
|
10
|
foreach my $index (0..$#$data) { |
|
111
|
6
|
|
|
|
|
11
|
my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1; |
|
|
36
|
|
|
|
|
90
|
|
|
|
6
|
|
|
|
|
15
|
|
|
112
|
6
|
50
|
|
|
|
18
|
if (scalar @undef > 0) { |
|
113
|
0
|
|
|
|
|
0
|
say STDERR join (',', @undef); |
|
114
|
0
|
|
|
|
|
0
|
die "The above indices are not defined for index $index in $current_sub"; |
|
115
|
|
|
|
|
|
|
} |
|
116
|
6
|
|
|
|
|
13
|
my @numeric = grep {looks_like_number($_)} @{ $data->[$index] }; |
|
|
36
|
|
|
|
|
86
|
|
|
|
6
|
|
|
|
|
13
|
|
|
117
|
6
|
|
|
|
|
60
|
my $q = quantile(\@numeric, probs => [0.25, 0.75]); |
|
118
|
6
|
|
|
|
|
118
|
my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric)); |
|
119
|
6
|
|
|
|
|
14
|
push @out, $vals; |
|
120
|
6
|
|
|
|
|
9
|
$rows_printed++; |
|
121
|
6
|
100
|
|
|
|
28
|
last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
} elsif ($ref_type eq 'HASH') { |
|
124
|
3
|
|
|
|
|
8
|
push @out, '-' x 78; |
|
125
|
3
|
|
|
|
|
18
|
my $header = sprintf('%9s ' x scalar @header, @header); |
|
126
|
3
|
|
|
|
|
10
|
unshift @header, 'Key'; |
|
127
|
3
|
|
|
|
|
9
|
$header = ' Key ' . $header; |
|
128
|
3
|
|
|
|
|
6
|
push @out, $header; |
|
129
|
3
|
|
|
|
|
6
|
push @out, '-' x 78; |
|
130
|
3
|
|
|
|
|
6
|
my $rows_printed = 0; |
|
131
|
3
|
|
|
|
|
7
|
foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) { |
|
|
4
|
|
|
|
|
17
|
|
|
|
3
|
|
|
|
|
19
|
|
|
132
|
5
|
|
|
|
|
11
|
my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1; |
|
|
33
|
|
|
|
|
64
|
|
|
|
5
|
|
|
|
|
17
|
|
|
133
|
5
|
50
|
|
|
|
16
|
if (scalar @undef > 0) { |
|
134
|
0
|
|
|
|
|
0
|
say STDERR join (',', @undef); |
|
135
|
0
|
|
|
|
|
0
|
die "The above indices are not defined for key $key in $current_sub"; |
|
136
|
|
|
|
|
|
|
} |
|
137
|
5
|
|
|
|
|
9
|
my @numeric = grep {looks_like_number($_)} @{ $data->{$key} }; |
|
|
33
|
|
|
|
|
73
|
|
|
|
5
|
|
|
|
|
11
|
|
|
138
|
5
|
|
|
|
|
60
|
my $q = quantile(\@numeric, probs => [0.25, 0.75]); |
|
139
|
5
|
|
|
|
|
15
|
my $print_key = substr($key, 0, 9); |
|
140
|
5
|
50
|
|
|
|
15
|
if ((length $print_key) < 9) { # make sure that short keys line up correctly |
|
141
|
5
|
|
|
|
|
12
|
$print_key .= ' ' x (9 - length $print_key); |
|
142
|
|
|
|
|
|
|
} |
|
143
|
5
|
|
|
|
|
116
|
my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric)); |
|
144
|
5
|
|
|
|
|
15
|
push @out, $vals; |
|
145
|
5
|
|
|
|
|
7
|
$rows_printed++; |
|
146
|
5
|
100
|
|
|
|
45
|
last if $rows_printed >= $args{nrows}; |
|
147
|
|
|
|
|
|
|
} |
|
148
|
|
|
|
|
|
|
} |
|
149
|
13
|
|
|
|
|
110
|
say join ("\n", @out); |
|
150
|
13
|
|
|
|
|
77
|
return \@out; |
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
sub read_table { |
|
154
|
552
|
|
|
552
|
1
|
975312
|
my $file = shift; |
|
155
|
552
|
100
|
|
|
|
5058
|
die "read_table: \"$file\" is not a file\n" unless -f $file; |
|
156
|
551
|
50
|
|
|
|
3342
|
die "read_table: \"$file\" is not readable\n" unless -r $file; |
|
157
|
|
|
|
|
|
|
|
|
158
|
551
|
|
|
|
|
727
|
my %input_args = @_; |
|
159
|
551
|
100
|
|
|
|
1009
|
if (exists $input_args{delim}) { |
|
160
|
|
|
|
|
|
|
# FIX: sep + delim together used to silently prefer delim |
|
161
|
|
|
|
|
|
|
die "read_table: pass either 'sep' or 'delim', not both\n" |
|
162
|
2
|
100
|
|
|
|
13
|
if exists $input_args{sep}; |
|
163
|
1
|
|
|
|
|
3
|
$input_args{sep} = delete $input_args{delim}; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
|
|
|
|
|
|
|
|
166
|
550
|
100
|
|
|
|
1475
|
my $default_sep = $file =~ /\.tsv$/i ? "\t" : ','; |
|
167
|
550
|
|
|
|
|
1196
|
my %args = ( |
|
168
|
|
|
|
|
|
|
sep => $default_sep, |
|
169
|
|
|
|
|
|
|
comment => '#', |
|
170
|
|
|
|
|
|
|
%input_args, |
|
171
|
|
|
|
|
|
|
); |
|
172
|
|
|
|
|
|
|
|
|
173
|
550
|
|
|
|
|
724
|
my %allowed_args = map { $_ => 1 } ( |
|
|
2750
|
|
|
|
|
3378
|
|
|
174
|
|
|
|
|
|
|
'comment', 'output.type', 'filter', 'row.names', 'sep', |
|
175
|
|
|
|
|
|
|
); |
|
176
|
550
|
|
|
|
|
962
|
my @undef_args = sort grep { !$allowed_args{$_} } keys %args; |
|
|
1139
|
|
|
|
|
1649
|
|
|
177
|
550
|
100
|
|
|
|
851
|
if (@undef_args) { |
|
178
|
1
|
|
|
|
|
3
|
my $current_sub = ( split /::/, (caller(0))[3] )[-1]; |
|
179
|
|
|
|
|
|
|
# FIX: no more printing to STDOUT from a library ('say'); the die |
|
180
|
|
|
|
|
|
|
# message carries the offending argument names itself |
|
181
|
1
|
|
|
|
|
39
|
die "the args \"@undef_args\" aren't defined for $current_sub\n"; |
|
182
|
|
|
|
|
|
|
} |
|
183
|
549
|
|
100
|
|
|
1219
|
my $otype = $args{'output.type'} // 'aoh'; |
|
184
|
549
|
100
|
|
|
|
1236
|
die "read_table: output.type \"$otype\" isn't allowed (aoh, hoa, hoh)\n" |
|
185
|
|
|
|
|
|
|
unless $otype =~ m/^(?:aoh|hoa|hoh)$/; |
|
186
|
|
|
|
|
|
|
|
|
187
|
547
|
|
|
|
|
590
|
my $filter = $args{filter}; |
|
188
|
547
|
100
|
100
|
|
|
1350
|
if (defined $filter && ref($filter) eq 'CODE') { |
|
|
|
100
|
100
|
|
|
|
|
|
189
|
1
|
|
|
|
|
5
|
$filter = { 0 => $filter }; |
|
190
|
|
|
|
|
|
|
} elsif (defined $filter && ref($filter) ne 'HASH') { |
|
191
|
1
|
|
|
|
|
6
|
die "'filter' must be a CODE or HASH reference\n"; |
|
192
|
|
|
|
|
|
|
} |
|
193
|
|
|
|
|
|
|
|
|
194
|
546
|
|
|
|
|
560
|
my (@data, %data, @header, @uniq_header, |
|
195
|
|
|
|
|
|
|
%mapped_filters, @sorted_filter_flds, %seen_rownames); |
|
196
|
546
|
|
|
|
|
508
|
my $data_row = 0; |
|
197
|
|
|
|
|
|
|
_parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub { |
|
198
|
6780
|
|
|
6780
|
|
7810
|
my ($line_ref) = @_; |
|
199
|
6780
|
100
|
|
|
|
8376
|
if (!@header) { |
|
200
|
|
|
|
|
|
|
# --- HEADER PROCESSING (copy made only here; runs once) --- |
|
201
|
546
|
|
|
|
|
918
|
my @line = @$line_ref; |
|
202
|
|
|
|
|
|
|
$line[0] =~ s/^\Q$args{comment}\E// |
|
203
|
546
|
50
|
33
|
|
|
3461
|
if @line && defined $line[0] && length( $args{comment} // '' ); |
|
|
|
|
50
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
204
|
546
|
|
|
|
|
763
|
@header = @line; |
|
205
|
546
|
100
|
66
|
|
|
1164
|
if (@header && $header[0] eq '') { |
|
206
|
11
|
|
|
|
|
28
|
$header[0] = 'row_name'; |
|
207
|
|
|
|
|
|
|
} |
|
208
|
546
|
|
|
|
|
501
|
my %seen_h; |
|
209
|
|
|
|
|
|
|
# FIX: hoa output with duplicate column names used to push the |
|
210
|
|
|
|
|
|
|
# same cell once PER OCCURRENCE, silently corrupting the column |
|
211
|
|
|
|
|
|
|
# lengths. Iterate unique names (order preserved) instead. |
|
212
|
546
|
|
|
|
|
576
|
@uniq_header = grep { !$seen_h{$_}++ } @header; |
|
|
1223
|
|
|
|
|
2272
|
|
|
213
|
546
|
|
|
|
|
574
|
my @dup_cols = grep { $seen_h{$_} > 1 } @uniq_header; |
|
|
1221
|
|
|
|
|
1462
|
|
|
214
|
546
|
100
|
|
|
|
1147
|
warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n" |
|
215
|
|
|
|
|
|
|
if @dup_cols; |
|
216
|
546
|
100
|
100
|
|
|
811
|
if ($otype eq 'hoh' && !defined $args{'row.names'}) { |
|
217
|
6
|
|
|
|
|
12
|
$args{'row.names'} = $header[0]; |
|
218
|
|
|
|
|
|
|
} |
|
219
|
546
|
100
|
100
|
|
|
814
|
if (defined $args{'row.names'} |
|
220
|
71
|
|
|
|
|
112
|
&& !grep { $_ eq $args{'row.names'} } @header) { |
|
221
|
1
|
|
|
|
|
7
|
die "\"$args{'row.names'}\" isn't in the header of $file\n"; |
|
222
|
|
|
|
|
|
|
} |
|
223
|
545
|
100
|
|
|
|
687
|
if ($filter) { |
|
224
|
11
|
|
|
|
|
28
|
for my $k (keys %$filter) { |
|
225
|
12
|
100
|
|
|
|
40
|
if ($k =~ /^\d+$/) { |
|
226
|
|
|
|
|
|
|
# FIX: a numeric key past the last column used to be |
|
227
|
|
|
|
|
|
|
# accepted and then silently extended every row via |
|
228
|
|
|
|
|
|
|
# the $_ write-back |
|
229
|
3
|
100
|
|
|
|
32
|
die "read_table: numeric filter key $k exceeds the " |
|
230
|
|
|
|
|
|
|
. scalar(@header) . " columns of $file\n" |
|
231
|
|
|
|
|
|
|
if $k > @header; |
|
232
|
2
|
|
|
|
|
4
|
$mapped_filters{$k} = $filter->{$k}; |
|
233
|
|
|
|
|
|
|
} else { |
|
234
|
9
|
|
|
|
|
20
|
my ($idx) = grep { $header[$_] eq $k } 0 .. $#header; |
|
|
54
|
|
|
|
|
67
|
|
|
235
|
9
|
100
|
|
|
|
22
|
die "Filter column '$k' not found in header\n" |
|
236
|
|
|
|
|
|
|
unless defined $idx; |
|
237
|
8
|
|
|
|
|
26
|
$mapped_filters{ $idx + 1 } = $filter->{$k}; |
|
238
|
|
|
|
|
|
|
} |
|
239
|
|
|
|
|
|
|
} |
|
240
|
9
|
|
|
|
|
22
|
@sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters; |
|
|
1
|
|
|
|
|
5
|
|
|
241
|
|
|
|
|
|
|
} |
|
242
|
543
|
|
|
|
|
2052
|
return; |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
# --- DATA PROCESSING (operate on $line_ref directly; no row copy) --- |
|
245
|
6234
|
|
|
|
|
5488
|
$data_row++; |
|
246
|
6234
|
100
|
|
|
|
7575
|
if (@$line_ref != @header) { |
|
247
|
|
|
|
|
|
|
# FIX: alignment errors now say WHICH row is ragged |
|
248
|
2
|
|
|
|
|
18
|
die sprintf "Alignment error on %s data row %d (%d fields vs %d headers).\n", |
|
249
|
|
|
|
|
|
|
$file, $data_row, scalar @$line_ref, scalar @header; |
|
250
|
|
|
|
|
|
|
} |
|
251
|
6232
|
|
|
|
|
5715
|
my %line_hash; |
|
252
|
6232
|
|
|
|
|
8025
|
for my $i (0 .. $#header) { |
|
253
|
74819
|
|
|
|
|
72881
|
my $v = $line_ref->[$i]; |
|
254
|
74819
|
100
|
66
|
|
|
158118
|
$line_hash{ $header[$i] } = ( !defined($v) || $v eq '' ) ? undef : $v; |
|
255
|
|
|
|
|
|
|
} |
|
256
|
|
|
|
|
|
|
# --- APPLY FILTERS --- |
|
257
|
6232
|
100
|
|
|
|
7821
|
if (@sorted_filter_flds) { |
|
258
|
|
|
|
|
|
|
# FIX: 'local %_ = %line_hash' copied the whole row for every |
|
259
|
|
|
|
|
|
|
# filtered row AND went stale after a $_ write-back. Aliasing the |
|
260
|
|
|
|
|
|
|
# glob makes %_ THE row hash: zero copy, mutations always visible. |
|
261
|
1862
|
|
|
|
|
1903
|
local *_ = \%line_hash; |
|
262
|
1862
|
|
|
|
|
1720
|
my $skip = 0; |
|
263
|
1862
|
|
|
|
|
1899
|
foreach my $fld (@sorted_filter_flds) { |
|
264
|
1865
|
100
|
|
|
|
2706
|
local $_ = $fld == 0 ? $line_ref : $line_hash{ $header[ $fld - 1 ] }; |
|
265
|
1865
|
100
|
|
|
|
2533
|
if ( !$mapped_filters{$fld}->( $line_ref, \%line_hash ) ) { |
|
266
|
1137
|
|
|
|
|
2247
|
$skip = 1; |
|
267
|
1137
|
|
|
|
|
1111
|
last; |
|
268
|
|
|
|
|
|
|
} |
|
269
|
728
|
100
|
|
|
|
1820
|
if ( $fld > 0 ) { # write back any mutation made to $_ |
|
270
|
727
|
|
|
|
|
709
|
$line_ref->[ $fld - 1 ] = $_; |
|
271
|
727
|
100
|
100
|
|
|
1635
|
$line_hash{ $header[ $fld - 1 ] } |
|
272
|
|
|
|
|
|
|
= ( !defined($_) || $_ eq '' ) ? undef : $_; |
|
273
|
|
|
|
|
|
|
} |
|
274
|
|
|
|
|
|
|
} |
|
275
|
1862
|
100
|
|
|
|
8849
|
return if $skip; |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
# Populate requested data structure |
|
278
|
5095
|
100
|
|
|
|
7354
|
if ($otype eq 'aoh') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
279
|
1886
|
|
|
|
|
14820
|
push @data, \%line_hash; |
|
280
|
|
|
|
|
|
|
} elsif ($otype eq 'hoa') { |
|
281
|
1118
|
|
|
|
|
1172
|
push @{ $data{$_} }, $line_hash{$_} for @uniq_header; |
|
|
15767
|
|
|
|
|
29527
|
|
|
282
|
|
|
|
|
|
|
} elsif ($otype eq 'hoh') { |
|
283
|
2091
|
|
|
|
|
2318
|
my $row_name = $line_hash{ $args{'row.names'} }; |
|
284
|
|
|
|
|
|
|
# FIX: an undef/empty row-name cell used to become the '' hash |
|
285
|
|
|
|
|
|
|
# key with "uninitialized" warnings; now it dies with the row |
|
286
|
|
|
|
|
|
|
die sprintf "read_table: undefined row name (column '%s') in %s data row %d\n", |
|
287
|
2091
|
100
|
|
|
|
2439
|
$args{'row.names'}, $file, $data_row |
|
288
|
|
|
|
|
|
|
unless defined $row_name; |
|
289
|
|
|
|
|
|
|
warn "read_table: duplicate row name '$row_name' in $file (later values win)\n" |
|
290
|
2090
|
100
|
|
|
|
3509
|
if $seen_rownames{$row_name}++; |
|
291
|
2090
|
|
|
|
|
2036
|
foreach my $col (@uniq_header) { |
|
292
|
29180
|
100
|
|
|
|
33709
|
next if $col eq $args{'row.names'}; |
|
293
|
27090
|
|
|
|
|
54994
|
$data{$row_name}{$col} = $line_hash{$col}; |
|
294
|
|
|
|
|
|
|
} |
|
295
|
|
|
|
|
|
|
} |
|
296
|
546
|
|
50
|
|
|
15670
|
}); |
|
|
|
|
50
|
|
|
|
|
|
297
|
540
|
100
|
|
|
|
6861
|
if ($otype eq 'aoh') { |
|
298
|
524
|
|
|
|
|
2113
|
return \@data; |
|
299
|
|
|
|
|
|
|
} else { # hoa or hoh |
|
300
|
16
|
|
|
|
|
7933
|
return \%data; |
|
301
|
|
|
|
|
|
|
} |
|
302
|
|
|
|
|
|
|
} |
|
303
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
|
304
|
|
|
|
|
|
|
# view(): an R-style `head` for the structures read_table() returns. |
|
305
|
|
|
|
|
|
|
# |
|
306
|
|
|
|
|
|
|
# Handles all three output.type values: |
|
307
|
|
|
|
|
|
|
# aoh -> ARRAY of HASH refs |
|
308
|
|
|
|
|
|
|
# hoa -> HASH of ARRAY refs |
|
309
|
|
|
|
|
|
|
# hoh -> HASH of HASH refs |
|
310
|
|
|
|
|
|
|
# |
|
311
|
|
|
|
|
|
|
# Pure Perl; the only dependency is Scalar::Util (core). There is nothing for |
|
312
|
|
|
|
|
|
|
# XS to speed up here: view only touches the first n rows, and the total-row |
|
313
|
|
|
|
|
|
|
# count is O(1) for every structure (scalar @$aoh, array length, scalar keys). |
|
314
|
|
|
|
|
|
|
# Keeping it pure Perl also keeps install portable (no compiler needed). |
|
315
|
|
|
|
|
|
|
# |
|
316
|
|
|
|
|
|
|
# Usage: |
|
317
|
|
|
|
|
|
|
# view($data); # first 6 rows, like head() |
|
318
|
|
|
|
|
|
|
# view($data, n => 20); # first 20 rows |
|
319
|
|
|
|
|
|
|
# view($data, cols => [...]); # pin explicit column order |
|
320
|
|
|
|
|
|
|
# view($data, na => '.', max_width => 30, gap => 1, ellipsis => '~'); |
|
321
|
|
|
|
|
|
|
# view($data, 'row.names' => 'id'); # use column 'id' as the row label |
|
322
|
|
|
|
|
|
|
# view($data, to => \*STDERR); # print elsewhere |
|
323
|
|
|
|
|
|
|
# my $s = view($data, return_only => 1); # capture string, suppress print |
|
324
|
|
|
|
|
|
|
# |
|
325
|
|
|
|
|
|
|
# Column order: read_table stores rows as hashes, so the original CSV column |
|
326
|
|
|
|
|
|
|
# order is gone. view sorts columns by name for a stable layout and treats a |
|
327
|
|
|
|
|
|
|
# column literally named 'row_name' (read_table's label for a leading blank |
|
328
|
|
|
|
|
|
|
# header) as the row label. Pass cols => [...] to override the order. |
|
329
|
|
|
|
|
|
|
# --------------------------------------------------------------------------- |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
sub view { |
|
332
|
41
|
|
|
41
|
1
|
164210
|
my $data = shift; |
|
333
|
41
|
|
|
|
|
112
|
my %args = @_; |
|
334
|
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# --- reject unknown arguments (mirrors read_table/write_table) --- |
|
336
|
41
|
|
|
|
|
74
|
my %allowed = map { $_ => 1 } qw( |
|
|
492
|
|
|
|
|
651
|
|
|
337
|
|
|
|
|
|
|
n rows na max_width ellipsis gap cols columns |
|
338
|
|
|
|
|
|
|
to return_only row.names row_names |
|
339
|
|
|
|
|
|
|
); |
|
340
|
41
|
|
|
|
|
98
|
my @bad = sort grep { !$allowed{$_} } keys %args; |
|
|
60
|
|
|
|
|
114
|
|
|
341
|
41
|
100
|
|
|
|
90
|
die "view: unknown argument(s): @bad\n" if @bad; |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# --- n / rows (synonyms); reject conflicting or non-integer values --- |
|
344
|
|
|
|
|
|
|
die "view: pass either 'n' or 'rows', not both\n" |
|
345
|
40
|
100
|
100
|
|
|
112
|
if exists $args{n} && exists $args{rows}; |
|
346
|
|
|
|
|
|
|
my $n = exists $args{rows} ? $args{rows} |
|
347
|
|
|
|
|
|
|
: exists $args{n} ? $args{n} |
|
348
|
39
|
100
|
|
|
|
89
|
: 6; |
|
|
|
100
|
|
|
|
|
|
|
349
|
39
|
100
|
66
|
|
|
285
|
die "view: 'n'/'rows' must be a non-negative integer\n" |
|
350
|
|
|
|
|
|
|
unless defined $n && $n =~ /^\d+$/; |
|
351
|
|
|
|
|
|
|
|
|
352
|
37
|
100
|
|
|
|
53
|
my $na = exists $args{na} ? $args{na} : 'NA'; |
|
353
|
37
|
100
|
|
|
|
62
|
my $maxw = exists $args{max_width} ? $args{max_width} : 80; |
|
354
|
37
|
100
|
|
|
|
48
|
my $ell = exists $args{ellipsis} ? $args{ellipsis} : '...'; |
|
355
|
37
|
50
|
|
|
|
45
|
my $gap = exists $args{gap} ? (' ' x $args{gap}) : ' '; |
|
356
|
37
|
|
100
|
|
|
80
|
my $ucols = $args{cols} || $args{columns}; |
|
357
|
37
|
|
|
|
|
42
|
my $fh = $args{to}; |
|
358
|
37
|
|
|
|
|
41
|
my $quiet = $args{return_only}; |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
# 'row.names' takes precedence over the row_names alias (both accepted) |
|
361
|
|
|
|
|
|
|
my $label_col = exists $args{'row.names'} ? $args{'row.names'} |
|
362
|
|
|
|
|
|
|
: exists $args{row_names} ? $args{row_names} |
|
363
|
37
|
100
|
|
|
|
67
|
: undef; |
|
|
|
100
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
|
|
365
|
37
|
|
|
|
|
58
|
my $rt = ref $data; |
|
366
|
37
|
100
|
100
|
|
|
85
|
die "view: expected an ARRAY (AoH) or HASH (HoA/HoH) reference, got " |
|
|
|
|
100
|
|
|
|
|
|
367
|
|
|
|
|
|
|
. ($rt || 'a non-reference') . "\n" |
|
368
|
|
|
|
|
|
|
unless $rt eq 'ARRAY' or $rt eq 'HASH'; |
|
369
|
|
|
|
|
|
|
|
|
370
|
35
|
|
|
|
|
48
|
my ($kind, @cols, @labels, @raw, $total, $lab_header); |
|
371
|
35
|
100
|
|
|
|
52
|
if ($rt eq 'ARRAY') { # ---- AoH ---- |
|
|
|
50
|
|
|
|
|
|
|
372
|
22
|
|
|
|
|
23
|
$kind = 'AoH'; |
|
373
|
22
|
|
|
|
|
25
|
$total = scalar @$data; |
|
374
|
22
|
100
|
|
|
|
27
|
my $show = $n < $total ? $n : $total; |
|
375
|
|
|
|
|
|
|
|
|
376
|
22
|
100
|
|
|
|
42
|
if ($ucols) { |
|
377
|
3
|
|
|
|
|
5
|
@cols = @$ucols; |
|
378
|
|
|
|
|
|
|
} else { |
|
379
|
|
|
|
|
|
|
# BUG FIX: scan at least one row when data exists, so the header |
|
380
|
|
|
|
|
|
|
# still lists columns even when showing 0 rows (n => 0). PERF: |
|
381
|
|
|
|
|
|
|
# collect unique keys once, then sort once -- not sort-per-row. |
|
382
|
19
|
100
|
|
|
|
32
|
my $scan = $show > 0 ? $show : ($total > 0 ? 1 : 0); |
|
|
|
100
|
|
|
|
|
|
|
383
|
19
|
|
|
|
|
22
|
my %seen; |
|
384
|
19
|
|
|
|
|
37
|
for my $i (0 .. $scan - 1) { |
|
385
|
29
|
|
|
|
|
28
|
my $row = $data->[$i]; |
|
386
|
29
|
100
|
|
|
|
47
|
next unless ref $row eq 'HASH'; |
|
387
|
28
|
|
|
|
|
74
|
$seen{$_} = 1 for keys %$row; |
|
388
|
|
|
|
|
|
|
} |
|
389
|
19
|
|
|
|
|
57
|
@cols = sort keys %seen; |
|
390
|
|
|
|
|
|
|
} |
|
391
|
|
|
|
|
|
|
my $lc = defined $label_col ? $label_col |
|
392
|
22
|
100
|
|
|
|
36
|
: (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef; |
|
|
32
|
100
|
|
|
|
44
|
|
|
393
|
22
|
100
|
|
|
|
53
|
if (defined $lc) { |
|
394
|
3
|
|
|
|
|
3
|
@cols = grep { $_ ne $lc } @cols; |
|
|
6
|
|
|
|
|
10
|
|
|
395
|
3
|
|
|
|
|
5
|
$lab_header = $lc; |
|
396
|
|
|
|
|
|
|
} |
|
397
|
22
|
|
|
|
|
30
|
for my $i (0 .. $show - 1) { |
|
398
|
31
|
|
|
|
|
33
|
my $row = $data->[$i]; |
|
399
|
31
|
100
|
|
|
|
40
|
$row = {} unless ref $row eq 'HASH'; |
|
400
|
31
|
100
|
|
|
|
42
|
push @labels, defined $lc ? $row->{$lc} : ($i + 1); |
|
401
|
31
|
|
|
|
|
34
|
push @raw, [ map { $row->{$_} } @cols ]; |
|
|
50
|
|
|
|
|
83
|
|
|
402
|
|
|
|
|
|
|
} |
|
403
|
22
|
100
|
|
|
|
39
|
$lab_header = '' unless defined $lab_header; |
|
404
|
|
|
|
|
|
|
} elsif ($rt eq 'HASH') { |
|
405
|
13
|
|
|
|
|
23
|
my @keys = keys %$data; |
|
406
|
13
|
|
|
|
|
12
|
my $sample; |
|
407
|
13
|
100
|
|
|
|
19
|
for my $k (@keys) { $sample = $data->{$k}; last if defined $sample; } |
|
|
13
|
|
|
|
|
14
|
|
|
|
13
|
|
|
|
|
21
|
|
|
408
|
13
|
|
|
|
|
15
|
my $vt = ref $sample; |
|
409
|
|
|
|
|
|
|
|
|
410
|
13
|
100
|
|
|
|
28
|
if (!@keys) { # ---- empty ---- |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# BUG FIX: an empty hash used to die ("neither ARRAY nor HASH"); |
|
412
|
|
|
|
|
|
|
# treat it as an empty table, mirroring an empty AoH. |
|
413
|
1
|
|
|
|
|
2
|
$kind = 'Hash'; |
|
414
|
1
|
|
|
|
|
1
|
$total = 0; |
|
415
|
1
|
|
|
|
|
1
|
$lab_header = ''; |
|
416
|
|
|
|
|
|
|
} elsif ($vt eq 'ARRAY') { # ---- HoA ---- |
|
417
|
5
|
|
|
|
|
5
|
$kind = 'HoA'; |
|
418
|
5
|
100
|
|
|
|
14
|
my @allcols = $ucols ? @$ucols : sort @keys; |
|
419
|
5
|
|
|
|
|
6
|
$total = 0; |
|
420
|
5
|
|
|
|
|
6
|
for my $k (@keys) { |
|
421
|
10
|
100
|
|
|
|
14
|
next unless ref $data->{$k} eq 'ARRAY'; |
|
422
|
9
|
|
|
|
|
8
|
my $l = scalar @{ $data->{$k} }; |
|
|
9
|
|
|
|
|
9
|
|
|
423
|
9
|
100
|
|
|
|
16
|
$total = $l if $l > $total; |
|
424
|
|
|
|
|
|
|
} |
|
425
|
5
|
50
|
|
|
|
7
|
my $show = $n < $total ? $n : $total; |
|
426
|
|
|
|
|
|
|
my $lc = defined $label_col ? $label_col |
|
427
|
5
|
50
|
|
|
|
11
|
: (grep { $_ eq 'row_name' } @allcols) ? 'row_name' : undef; |
|
|
7
|
100
|
|
|
|
11
|
|
|
428
|
5
|
100
|
|
|
|
7
|
@cols = grep { !defined $lc || $_ ne $lc } @allcols; |
|
|
9
|
|
|
|
|
41
|
|
|
429
|
5
|
100
|
|
|
|
11
|
$lab_header = defined $lc ? $lc : ''; |
|
430
|
5
|
|
|
|
|
12
|
for my $i (0 .. $show - 1) { |
|
431
|
|
|
|
|
|
|
push @labels, defined $lc |
|
432
|
13
|
50
|
|
|
|
23
|
? (ref $data->{$lc} eq 'ARRAY' ? $data->{$lc}[$i] : undef) |
|
|
|
100
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
: ($i + 1); |
|
434
|
|
|
|
|
|
|
push @raw, [ map { |
|
435
|
13
|
100
|
|
|
|
18
|
ref $data->{$_} eq 'ARRAY' ? $data->{$_}[$i] : undef |
|
|
21
|
|
|
|
|
51
|
|
|
436
|
|
|
|
|
|
|
} @cols ]; |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
} elsif ($vt eq 'HASH') { # ---- HoH ---- |
|
439
|
6
|
|
|
|
|
6
|
$kind = 'HoH'; |
|
440
|
6
|
|
|
|
|
7
|
$total = scalar @keys; |
|
441
|
6
|
|
|
|
|
18
|
my @rk = sort @keys; |
|
442
|
6
|
100
|
|
|
|
9
|
my $show = $n < $total ? $n : $total; |
|
443
|
6
|
50
|
|
|
|
32
|
my @shown = $show > 0 ? @rk[0 .. $show - 1] : (); |
|
444
|
6
|
100
|
|
|
|
9
|
if ($ucols) { |
|
445
|
1
|
|
|
|
|
1
|
@cols = @$ucols; |
|
446
|
|
|
|
|
|
|
} else { |
|
447
|
|
|
|
|
|
|
# PERF: gather unique inner keys, sort once. |
|
448
|
5
|
|
|
|
|
6
|
my %seen; |
|
449
|
5
|
|
|
|
|
6
|
for my $rkk (@shown) { |
|
450
|
9
|
100
|
|
|
|
14
|
next unless ref $data->{$rkk} eq 'HASH'; |
|
451
|
8
|
|
|
|
|
8
|
$seen{$_} = 1 for keys %{ $data->{$rkk} }; |
|
|
8
|
|
|
|
|
24
|
|
|
452
|
|
|
|
|
|
|
} |
|
453
|
5
|
|
|
|
|
12
|
@cols = sort keys %seen; |
|
454
|
|
|
|
|
|
|
} |
|
455
|
6
|
100
|
|
|
|
11
|
@cols = grep { $_ ne $label_col } @cols if defined $label_col; |
|
|
4
|
|
|
|
|
7
|
|
|
456
|
|
|
|
|
|
|
# BUG FIX: honour the row_names alias here too (was 'row.names' |
|
457
|
|
|
|
|
|
|
# only, so row_names => 'x' showed a 'row_name' header). |
|
458
|
6
|
100
|
|
|
|
9
|
$lab_header = defined $label_col ? $label_col : 'row_name'; |
|
459
|
6
|
|
|
|
|
6
|
for my $rkk (@shown) { |
|
460
|
11
|
|
|
|
|
12
|
push @labels, $rkk; |
|
461
|
11
|
100
|
|
|
|
18
|
my $inner = ref $data->{$rkk} eq 'HASH' ? $data->{$rkk} : {}; |
|
462
|
11
|
|
|
|
|
12
|
push @raw, [ map { $inner->{$_} } @cols ]; |
|
|
18
|
|
|
|
|
35
|
|
|
463
|
|
|
|
|
|
|
} |
|
464
|
|
|
|
|
|
|
} else { # ---- flat hash ---- |
|
465
|
|
|
|
|
|
|
# BUG/FEATURE: a hash whose values are plain scalars |
|
466
|
|
|
|
|
|
|
# ({ a => 1, b => 2, ... }) used to die ("neither ARRAY nor |
|
467
|
|
|
|
|
|
|
# HASH"). Render it like write_table's flat hash: one row, keys |
|
468
|
|
|
|
|
|
|
# as columns, a numeric '1' row label. |
|
469
|
1
|
|
|
|
|
2
|
$kind = 'Hash'; |
|
470
|
1
|
|
|
|
|
1
|
$total = 1; |
|
471
|
1
|
50
|
|
|
|
3
|
my $show = $n < $total ? $n : $total; |
|
472
|
1
|
50
|
|
|
|
3
|
if ($ucols) { |
|
473
|
0
|
|
|
|
|
0
|
@cols = @$ucols; |
|
474
|
|
|
|
|
|
|
} else { |
|
475
|
1
|
|
|
|
|
4
|
@cols = sort @keys; |
|
476
|
|
|
|
|
|
|
} |
|
477
|
|
|
|
|
|
|
my $lc = defined $label_col ? $label_col |
|
478
|
1
|
50
|
|
|
|
3
|
: (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef; |
|
|
3
|
50
|
|
|
|
5
|
|
|
479
|
1
|
50
|
|
|
|
3
|
if (defined $lc) { |
|
480
|
0
|
|
|
|
|
0
|
@cols = grep { $_ ne $lc } @cols; |
|
|
0
|
|
|
|
|
0
|
|
|
481
|
0
|
|
|
|
|
0
|
$lab_header = $lc; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
1
|
50
|
|
|
|
2
|
$lab_header = '' unless defined $lab_header; |
|
484
|
1
|
|
|
|
|
3
|
for my $i (0 .. $show - 1) { |
|
485
|
1
|
50
|
|
|
|
2
|
push @labels, defined $lc ? $data->{$lc} : ($i + 1); |
|
486
|
1
|
|
|
|
|
2
|
push @raw, [ map { $data->{$_} } @cols ]; |
|
|
3
|
|
|
|
|
5
|
|
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
} |
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
# numeric detection per column (drives right/left alignment) |
|
492
|
35
|
|
|
|
|
65
|
my @numeric = (1) x scalar @cols; |
|
493
|
35
|
|
|
|
|
44
|
for my $r (@raw) { |
|
494
|
56
|
|
|
|
|
79
|
for my $j (0 .. $#cols) { |
|
495
|
92
|
|
|
|
|
90
|
my $v = $r->[$j]; |
|
496
|
92
|
100
|
|
|
|
126
|
next unless defined $v; |
|
497
|
84
|
100
|
|
|
|
172
|
$numeric[$j] = 0 unless looks_like_number($v); |
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
} |
|
500
|
35
|
100
|
|
|
|
53
|
my $lab_numeric = @labels ? 1 : 0; |
|
501
|
35
|
|
|
|
|
36
|
for my $l (@labels) { |
|
502
|
48
|
100
|
66
|
|
|
134
|
$lab_numeric = 0, last unless defined $l && looks_like_number($l); |
|
503
|
|
|
|
|
|
|
} |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# stringify + sanitize + truncate cells (column names left intact) |
|
506
|
35
|
|
|
|
|
36
|
my $ell_len = length $ell; |
|
507
|
|
|
|
|
|
|
my $clean = sub { |
|
508
|
148
|
|
|
148
|
|
161
|
my $v = shift; |
|
509
|
148
|
100
|
|
|
|
197
|
return $na unless defined $v; |
|
510
|
140
|
|
|
|
|
136
|
$v = "$v"; |
|
511
|
140
|
|
|
|
|
162
|
$v =~ s/\t/\\t/g; $v =~ s/\r/\\r/g; $v =~ s/\n/\\n/g; |
|
|
140
|
|
|
|
|
149
|
|
|
|
140
|
|
|
|
|
147
|
|
|
512
|
140
|
100
|
66
|
|
|
256
|
if ($maxw && length($v) > $maxw) { |
|
513
|
2
|
|
|
|
|
2
|
my $keep = $maxw - $ell_len; |
|
514
|
2
|
50
|
|
|
|
4
|
$keep = 0 if $keep < 0; |
|
515
|
2
|
|
|
|
|
4
|
$v = substr($v, 0, $keep) . $ell; |
|
516
|
|
|
|
|
|
|
} |
|
517
|
140
|
|
|
|
|
236
|
return $v; |
|
518
|
35
|
|
|
|
|
178
|
}; |
|
519
|
35
|
|
|
|
|
47
|
my @lab_s = map { $clean->($_) } @labels; |
|
|
56
|
|
|
|
|
96
|
|
|
520
|
35
|
|
|
|
|
43
|
my @rows_s = map { [ map { $clean->($_) } @$_ ] } @raw; |
|
|
56
|
|
|
|
|
64
|
|
|
|
92
|
|
|
|
|
88
|
|
|
521
|
35
|
|
|
|
|
54
|
my @head_s = @cols; |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
# column widths |
|
524
|
35
|
|
|
|
|
36
|
my $lab_w = length $lab_header; |
|
525
|
35
|
100
|
|
|
|
44
|
for my $s (@lab_s) { my $l = length $s; $lab_w = $l if $l > $lab_w; } |
|
|
56
|
|
|
|
|
54
|
|
|
|
56
|
|
|
|
|
91
|
|
|
526
|
35
|
|
|
|
|
35
|
my @w = map { length $_ } @head_s; |
|
|
54
|
|
|
|
|
66
|
|
|
527
|
35
|
|
|
|
|
39
|
for my $r (@rows_s) { |
|
528
|
56
|
|
|
|
|
66
|
for my $j (0 .. $#cols) { |
|
529
|
92
|
|
|
|
|
103
|
my $l = length $r->[$j]; |
|
530
|
92
|
100
|
|
|
|
136
|
$w[$j] = $l if $l > $w[$j]; |
|
531
|
|
|
|
|
|
|
} |
|
532
|
|
|
|
|
|
|
} |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
my $pad = sub { |
|
535
|
237
|
|
|
237
|
|
295
|
my ($s, $width, $right) = @_; |
|
536
|
237
|
50
|
|
|
|
243
|
my $g = $width - length $s; $g = 0 if $g < 0; |
|
|
237
|
|
|
|
|
308
|
|
|
537
|
237
|
100
|
|
|
|
502
|
return $right ? (' ' x $g) . $s : $s . (' ' x $g); |
|
538
|
35
|
|
|
|
|
93
|
}; |
|
539
|
|
|
|
|
|
|
|
|
540
|
35
|
|
|
|
|
37
|
my @out; |
|
541
|
35
|
|
|
|
|
39
|
my $shown = scalar @rows_s; |
|
542
|
35
|
100
|
|
|
|
167
|
push @out, sprintf("# %s: %d row%s x %d col%s (showing %d)", |
|
|
|
100
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
$kind, $total, ($total == 1 ? '' : 's'), |
|
544
|
|
|
|
|
|
|
scalar(@cols), (@cols == 1 ? '' : 's'), $shown); |
|
545
|
|
|
|
|
|
|
|
|
546
|
35
|
|
|
|
|
60
|
my @hcells = ( $pad->($lab_header, $lab_w, 0) ); |
|
547
|
35
|
|
|
|
|
83
|
push @hcells, $pad->($head_s[$_], $w[$_], $numeric[$_]) for 0 .. $#cols; |
|
548
|
35
|
|
|
|
|
76
|
push @out, join($gap, @hcells); |
|
549
|
|
|
|
|
|
|
|
|
550
|
35
|
|
|
|
|
45
|
for my $ri (0 .. $#rows_s) { |
|
551
|
56
|
|
|
|
|
66
|
my @cells = ( $pad->($lab_s[$ri], $lab_w, $lab_numeric) ); |
|
552
|
56
|
|
|
|
|
96
|
push @cells, $pad->($rows_s[$ri][$_], $w[$_], $numeric[$_]) for 0 .. $#cols; |
|
553
|
56
|
|
|
|
|
133
|
push @out, join($gap, @cells); |
|
554
|
|
|
|
|
|
|
} |
|
555
|
35
|
50
|
|
|
|
59
|
push @out, sprintf("# ... %d more row%s", $total - $shown, |
|
|
|
100
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
($total - $shown == 1 ? '' : 's')) if $shown < $total; |
|
557
|
|
|
|
|
|
|
|
|
558
|
35
|
|
|
|
|
65
|
my $str = join("\n", @out) . "\n"; |
|
559
|
35
|
50
|
|
|
|
57
|
unless ($quiet) { defined $fh ? print {$fh} $str : print $str; } |
|
|
1
|
100
|
|
|
|
7
|
|
|
|
1
|
|
|
|
|
5
|
|
|
560
|
35
|
|
|
|
|
461
|
return $str; |
|
561
|
|
|
|
|
|
|
} |
|
562
|
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
1; |
|
564
|
|
|
|
|
|
|
=encoding utf8 |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=head1 Synopsis |
|
567
|
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc. |
|
569
|
|
|
|
|
|
|
I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter. |
|
570
|
|
|
|
|
|
|
There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly. |
|
571
|
|
|
|
|
|
|
This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability. |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
There B other modules on CPAN that can do B of this, but this works the way that I B it to. |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
=head1 Functions/Subroutines |
|
576
|
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
======================================================================== |
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=head2 add_data |
|
580
|
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference. |
|
582
|
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
=head3 Hash of Hashes (HoH) |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows. |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
$data = { 'Jack Smith' => { age => 30 } }; |
|
588
|
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
$n = { |
|
590
|
|
|
|
|
|
|
'Jack Smith' => { # Update existing (Hash) |
|
591
|
|
|
|
|
|
|
dept => 'Engineering' |
|
592
|
|
|
|
|
|
|
}, |
|
593
|
|
|
|
|
|
|
'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash) |
|
594
|
|
|
|
|
|
|
'Invalid' => 'Not a reference' # Edge case safety |
|
595
|
|
|
|
|
|
|
}; |
|
596
|
|
|
|
|
|
|
|
|
597
|
|
|
|
|
|
|
add_data($data, $n); |
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
B |
|
600
|
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
{ |
|
602
|
|
|
|
|
|
|
"Jack Smith": { |
|
603
|
|
|
|
|
|
|
"age": 30, |
|
604
|
|
|
|
|
|
|
"dept": "Engineering" |
|
605
|
|
|
|
|
|
|
}, |
|
606
|
|
|
|
|
|
|
"Jane Doe": { |
|
607
|
|
|
|
|
|
|
"age": 25, |
|
608
|
|
|
|
|
|
|
"dept": "Sales" |
|
609
|
|
|
|
|
|
|
} |
|
610
|
|
|
|
|
|
|
} |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=head3 Hash of Arrays (HoA) |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C. |
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
$data = { 'Project Alpha' => [ 'task1', 'task2' ] }; |
|
617
|
|
|
|
|
|
|
$n = { |
|
618
|
|
|
|
|
|
|
'Project Alpha' => [ 'task3' ], # Appends to existing array |
|
619
|
|
|
|
|
|
|
'Project Beta' => [ 'task1', 'task2' ] # Creates new array row |
|
620
|
|
|
|
|
|
|
}; |
|
621
|
|
|
|
|
|
|
add_data($data, $n); |
|
622
|
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
B |
|
624
|
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
{ |
|
626
|
|
|
|
|
|
|
"Project Alpha": [ "task1", "task2", "task3" ], |
|
627
|
|
|
|
|
|
|
"Project Beta": [ "task1", "task2" ] |
|
628
|
|
|
|
|
|
|
} |
|
629
|
|
|
|
|
|
|
|
|
630
|
|
|
|
|
|
|
=head3 Array of Hashes / Arrays (AoH / AoA) |
|
631
|
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices. |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
$data = [ |
|
635
|
|
|
|
|
|
|
{ id => 1, name => 'Alice' } |
|
636
|
|
|
|
|
|
|
]; |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
$n = [ |
|
639
|
|
|
|
|
|
|
{ role => 'Admin' }, # Updates index 0 |
|
640
|
|
|
|
|
|
|
{ id => 2, name => 'Bob' } # Creates index 1 |
|
641
|
|
|
|
|
|
|
]; |
|
642
|
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
add_data($data, $n); |
|
644
|
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
B |
|
646
|
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
[ |
|
648
|
|
|
|
|
|
|
{ "id": 1, "name": "Alice", "role": "Admin" }, |
|
649
|
|
|
|
|
|
|
{ "id": 2, "name": "Bob" } |
|
650
|
|
|
|
|
|
|
] |
|
651
|
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=head3 Advanced Structural Coercion & Cross-Merging |
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target. |
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
B<1. Inner Coercion (Mixing Rows):> |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
=over |
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=item * B Source Array rows are read in pairs and converted to key-value pairs. |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
=item * B Source Hash rows are flattened into key-value pairs and pushed onto the array. |
|
663
|
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
=back |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
B<2. Root-Level Coercion (Mixing Outer Containers):> |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
=over |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
=item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored. |
|
671
|
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
=item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">). |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
=back |
|
675
|
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=head3 Source is a mixed Hash. Keys dictate the target array index! |
|
677
|
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
$n = { |
|
679
|
|
|
|
|
|
|
'0' => { y => 20 }, # Merges into $data->[0] |
|
680
|
|
|
|
|
|
|
'1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1] |
|
681
|
|
|
|
|
|
|
'ignored' => { k => 'v' } # Ignored: cannot map to an array index |
|
682
|
|
|
|
|
|
|
}; |
|
683
|
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
add_data($data, $n); |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
B |
|
687
|
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
[ |
|
689
|
|
|
|
|
|
|
{ "x": 10, "y": 20 }, |
|
690
|
|
|
|
|
|
|
{ "z": 30 } |
|
691
|
|
|
|
|
|
|
] |
|
692
|
|
|
|
|
|
|
|
|
693
|
|
|
|
|
|
|
NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data. |
|
694
|
|
|
|
|
|
|
|
|
695
|
|
|
|
|
|
|
=head2 aoh2hoa |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
C — transpose an B (row-major) into a B (column-major). |
|
698
|
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
my $hoa = aoh2hoa([ { a => 1, b => 2 }, { a => 3 } ]); |
|
700
|
|
|
|
|
|
|
# $hoa = { a => [1, 3], b => [2, undef] } |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
Rows go in, columns come out: each distinct key across the input rows becomes one output column, and the values are gathered down that column in row order. |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
=head3 Arguments |
|
705
|
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
C<$aoh> — an array ref of hash refs, one hash per row. This is the only argument, and it is required. Passing anything that is not an array ref is fatal: |
|
707
|
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
aoh2hoa({ a => 1 }); # dies: argument must be an arrayref of hashrefs |
|
709
|
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
=head3 Returns |
|
711
|
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
A hash ref of array refs. Each key is a column name (the union of all keys seen across the rows); each value is an array ref holding that column's cells. Every column has exactly C elements, so the result is rectangular even when the input is ragged. |
|
713
|
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
=head3 Behavior |
|
715
|
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
The column set is the B of every row's keys — a key that appears in only some rows still produces a full-length column, with C in the rows that lacked it. |
|
717
|
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
Each column is padded to exactly the row count. Cells missing from a given row come through as C, including trailing gaps (a column whose last contributing row is early still runs the full length). These absent cells are cheap holes in the array, not stored SVs. |
|
719
|
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
Values are B (C), so the returned structure is independent of the input — mutating C<$aoh> afterward won't disturb the result. The copy is shallow: a value that is itself a reference is copied the same way C<< $col-E[$i] = $row-E{$k} >> would, i.e. the ref is duplicated but its referent is shared. |
|
721
|
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
Keys are handled SV-first (C / C), so UTF-8 and otherwise non-trivial hash keys round-trip correctly. |
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
A row that is B a hash ref is skipped rather than fatal: it contributes C to every column at its index. So a stray C or scalar in the input thins the columns at that position instead of dying. |
|
725
|
|
|
|
|
|
|
|
|
726
|
|
|
|
|
|
|
=head3 Notes |
|
727
|
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
The output column order follows hash iteration order and is therefore not guaranteed — sort the keys if you need a stable layout. Round-tripping through C (or the reverse) reconstructs the data but not necessarily the original key/row ordering, and rows originally absent a key will gain it as an explicit C. |
|
729
|
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
=head2 aov |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
Warning: assumes normal distribution |
|
733
|
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
aov( |
|
735
|
|
|
|
|
|
|
{ |
|
736
|
|
|
|
|
|
|
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
737
|
|
|
|
|
|
|
ctrl => [1, 1, 1, 0, 0, 0] |
|
738
|
|
|
|
|
|
|
}, |
|
739
|
|
|
|
|
|
|
'yield ~ ctrl'); |
|
740
|
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
which returns |
|
742
|
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
{ |
|
744
|
|
|
|
|
|
|
ctrl { |
|
745
|
|
|
|
|
|
|
Df 1, |
|
746
|
|
|
|
|
|
|
"F value" 25.6000000000001, |
|
747
|
|
|
|
|
|
|
"Mean Sq" 1.70666666666667, |
|
748
|
|
|
|
|
|
|
Pr(>F) 0.00718232855871859, |
|
749
|
|
|
|
|
|
|
"Sum Sq" 1.70666666666667 |
|
750
|
|
|
|
|
|
|
}, |
|
751
|
|
|
|
|
|
|
Residuals { |
|
752
|
|
|
|
|
|
|
Df 4, |
|
753
|
|
|
|
|
|
|
"Mean Sq" 0.0666666666666665, |
|
754
|
|
|
|
|
|
|
"Sum Sq" 0.266666666666666 |
|
755
|
|
|
|
|
|
|
} |
|
756
|
|
|
|
|
|
|
} |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction: |
|
759
|
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
my $res_2way = aov($data_2way, 'len ~ supp * dose'); |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior. |
|
763
|
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
=head3 Input Parameters |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=begin html |
|
769
|
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
| Parameter |
|
774
|
|
|
|
|
|
|
| Type |
|
775
|
|
|
|
|
|
|
| Default |
|
776
|
|
|
|
|
|
|
| Description |
|
777
|
|
|
|
|
|
|
| Example |
|
778
|
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
| |
|
780
|
|
|
|
|
|
|
|
|
781
|
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
| data_sv |
|
783
|
|
|
|
|
|
|
| HashRef or ArrayRef |
|
784
|
|
|
|
|
|
|
| (Required) |
|
785
|
|
|
|
|
|
|
| The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's stack() on a named list). |
|
786
|
|
|
|
|
|
|
| |
|
787
|
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
| formula_sv |
|
790
|
|
|
|
|
|
|
| String |
|
791
|
|
|
|
|
|
|
| undef |
|
792
|
|
|
|
|
|
|
| A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to 'Value ~ Group' and the input data is stacked. |
|
793
|
|
|
|
|
|
|
| 'yield ~ N * P' |
|
794
|
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
| |
|
797
|
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
=end html |
|
799
|
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
=head3 Output Variables |
|
803
|
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs. |
|
805
|
|
|
|
|
|
|
|
|
806
|
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
=begin html |
|
809
|
|
|
|
|
|
|
|
|
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
| Parameter |
|
814
|
|
|
|
|
|
|
| Type |
|
815
|
|
|
|
|
|
|
| Default |
|
816
|
|
|
|
|
|
|
| Description |
|
817
|
|
|
|
|
|
|
| Example |
|
818
|
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
| |
|
820
|
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
| (Term Name) |
|
823
|
|
|
|
|
|
|
| HashRef |
|
824
|
|
|
|
|
|
|
| undef |
|
825
|
|
|
|
|
|
|
| A nested hash for each independent term in the formula (e.g., 'Group', 'N:P'), containing its ANOVA table statistics. |
|
826
|
|
|
|
|
|
|
| {'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004} |
|
827
|
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
| Residuals |
|
830
|
|
|
|
|
|
|
| HashRef |
|
831
|
|
|
|
|
|
|
| undef |
|
832
|
|
|
|
|
|
|
| A nested hash containing the residual (error) statistics for the fitted model. |
|
833
|
|
|
|
|
|
|
| {'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55} |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
| group_stats |
|
837
|
|
|
|
|
|
|
| HashRef |
|
838
|
|
|
|
|
|
|
| undef |
|
839
|
|
|
|
|
|
|
| A nested hash containing descriptive statistics (mean and size / count) for every column evaluated in the original unstacked data structure. |
|
840
|
|
|
|
|
|
|
| {'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}} |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
| |
|
844
|
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
=end html |
|
846
|
|
|
|
|
|
|
|
|
847
|
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
=head3 omitting formula |
|
850
|
|
|
|
|
|
|
|
|
851
|
|
|
|
|
|
|
As of version 0.07, in the case of an omitted formula, stacking is done: |
|
852
|
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
aov( |
|
854
|
|
|
|
|
|
|
{ |
|
855
|
|
|
|
|
|
|
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
856
|
|
|
|
|
|
|
ctrl => [1, 1, 1, 0, 0, 0] |
|
857
|
|
|
|
|
|
|
}, |
|
858
|
|
|
|
|
|
|
); |
|
859
|
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
is the equivalent of: |
|
861
|
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2) |
|
863
|
|
|
|
|
|
|
ctrl <- c(1, 1, 1, 0, 0, 0) |
|
864
|
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
# Combine them into a named list (the R equivalent of your hash) |
|
866
|
|
|
|
|
|
|
my_list <- list(yield = yield, ctrl = ctrl) |
|
867
|
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
# Convert the list into a "long" dataframe |
|
869
|
|
|
|
|
|
|
# This creates two columns: "values" and "ind" (the group name) |
|
870
|
|
|
|
|
|
|
my_data <- stack(my_list) |
|
871
|
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
# Rename columns for clarity (optional but good practice) |
|
873
|
|
|
|
|
|
|
colnames(my_data) <- c("Value", "Group") |
|
874
|
|
|
|
|
|
|
anova_model <- aov(Value ~ Group, data = my_data) |
|
875
|
|
|
|
|
|
|
summary(anova_model) |
|
876
|
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
in R |
|
878
|
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
=head2 cfilter |
|
880
|
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
Select B out of a table and return it in the same shape. A column is |
|
882
|
|
|
|
|
|
|
the inner (second-level) key of a B or an B, |
|
883
|
|
|
|
|
|
|
or the outer key of a B: |
|
884
|
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
use Stats::LikeR; |
|
886
|
|
|
|
|
|
|
my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] ); |
|
887
|
|
|
|
|
|
|
cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] } |
|
888
|
|
|
|
|
|
|
cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] } |
|
889
|
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
C takes exactly one of C or C. C returns only the |
|
891
|
|
|
|
|
|
|
matching columns; C returns everything except them. The result is the |
|
892
|
|
|
|
|
|
|
same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values |
|
893
|
|
|
|
|
|
|
copied and the original structure left untouched. |
|
894
|
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
=head3 Selecting by name |
|
896
|
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
Pass an array ref of column names. Naming a column that is not present in the |
|
898
|
|
|
|
|
|
|
data is an error (it catches typos), and a row that happens not to contain a |
|
899
|
|
|
|
|
|
|
kept column simply comes back without it: |
|
900
|
|
|
|
|
|
|
|
|
901
|
|
|
|
|
|
|
my @aoh = ( { a => 1, b => 2 }, { a => 3 } ); |
|
902
|
|
|
|
|
|
|
cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ] |
|
903
|
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
=head3 Selecting by a predicate |
|
905
|
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
Instead of names, C/C accept a B — a CODE ref or a |
|
907
|
|
|
|
|
|
|
function name — evaluated once per column. It is called as |
|
908
|
|
|
|
|
|
|
|
|
909
|
|
|
|
|
|
|
$predicate->($column_values, $column_name) |
|
910
|
|
|
|
|
|
|
|
|
911
|
|
|
|
|
|
|
where C<$column_values> is an array ref of the column's B cells (undef |
|
912
|
|
|
|
|
|
|
and missing cells are dropped, so functions like C get clean input). |
|
913
|
|
|
|
|
|
|
With C, columns for which the predicate is true are kept; with C, |
|
914
|
|
|
|
|
|
|
those columns are dropped. |
|
915
|
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
# Keep only the constant columns (standard deviation zero): |
|
917
|
|
|
|
|
|
|
my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] } |
|
918
|
|
|
|
|
|
|
# Drop the constant columns instead: |
|
919
|
|
|
|
|
|
|
my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... } |
|
920
|
|
|
|
|
|
|
# A bare function name resolves in Stats::LikeR:: (use a package for your own): |
|
921
|
|
|
|
|
|
|
cfilter(\%hoa, keep => 'some_predicate'); |
|
922
|
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
A bare string is always treated as a B, not a single column |
|
924
|
|
|
|
|
|
|
name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>. |
|
925
|
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
=head3 Errors |
|
927
|
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
C dies (via C) when: |
|
929
|
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
=over |
|
931
|
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
=item * neither C nor C is given, or both are, |
|
933
|
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
=item * a named column is not present in the data, |
|
935
|
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=item * the selector is neither an array ref nor a code ref / function name, or the |
|
937
|
|
|
|
|
|
|
function name cannot be resolved, |
|
938
|
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
=item * an unknown option is given, or the options are not C<< name =E value >> pairs, |
|
940
|
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=item * the data is not a hash/array reference of the expected shape (a hash of hash |
|
942
|
|
|
|
|
|
|
refs or array refs, or an array of hash refs). |
|
943
|
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
=back |
|
945
|
|
|
|
|
|
|
|
|
946
|
|
|
|
|
|
|
=head2 chisq_test |
|
947
|
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results. |
|
949
|
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
For 2x2 matrices, Yates' Continuity Correction is applied automatically. |
|
951
|
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
=head3 Accepted Inputs |
|
953
|
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
=begin html |
|
957
|
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
|
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
|
|
961
|
|
|
|
|
|
|
| Input Type |
|
962
|
|
|
|
|
|
|
| Data Structure |
|
963
|
|
|
|
|
|
|
| Applied Test |
|
964
|
|
|
|
|
|
|
|
|
965
|
|
|
|
|
|
|
| |
|
966
|
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
|
|
968
|
|
|
|
|
|
|
| 1D Array |
|
969
|
|
|
|
|
|
|
| [ $v1, $v2, ... ] |
|
970
|
|
|
|
|
|
|
| Chi-squared test for given probabilities |
|
971
|
|
|
|
|
|
|
|
|
972
|
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
| 2D Array |
|
974
|
|
|
|
|
|
|
| [ [ $v1, $v2 ], [ $v3, $v4 ] ] |
|
975
|
|
|
|
|
|
|
| Pearson's Chi-squared test (Yates' correction if 2x2) |
|
976
|
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
| 1D Hash |
|
979
|
|
|
|
|
|
|
| { key1 => $v1, key2 => $v2 } |
|
980
|
|
|
|
|
|
|
| Chi-squared test for given probabilities |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
| 2D Hash |
|
984
|
|
|
|
|
|
|
| { row1 => { c1 => $v1, c2 => $v2 } } |
|
985
|
|
|
|
|
|
|
| Pearson's Chi-squared test (Yates' correction if 2x2) |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
| |
|
989
|
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
=end html |
|
991
|
|
|
|
|
|
|
|
|
992
|
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
=head3 Output Object Structure |
|
995
|
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input. |
|
997
|
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
=begin html |
|
1001
|
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
=end html |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
|
|
1052
|
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
=head3 Two-Dimensional Array |
|
1054
|
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically. |
|
1056
|
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
my $test_data = [ |
|
1058
|
|
|
|
|
|
|
[762, 327, 468], |
|
1059
|
|
|
|
|
|
|
[484, 239, 477] |
|
1060
|
|
|
|
|
|
|
]; |
|
1061
|
|
|
|
|
|
|
my $res = chisq_test($test_data); |
|
1062
|
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
B |
|
1064
|
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
{ |
|
1066
|
|
|
|
|
|
|
'data.name' => 'Perl ArrayRef', |
|
1067
|
|
|
|
|
|
|
'expected' => [ |
|
1068
|
|
|
|
|
|
|
[ 703.671381936888, 319.645266594124, 533.683351468988 ], |
|
1069
|
|
|
|
|
|
|
[ 542.328618063112, 246.354733405876, 411.316648531012 ] |
|
1070
|
|
|
|
|
|
|
], |
|
1071
|
|
|
|
|
|
|
'method' => "Pearson's Chi-squared test", |
|
1072
|
|
|
|
|
|
|
'observed' => [ |
|
1073
|
|
|
|
|
|
|
[ 762, 327, 468 ], |
|
1074
|
|
|
|
|
|
|
[ 484, 239, 477 ] |
|
1075
|
|
|
|
|
|
|
], |
|
1076
|
|
|
|
|
|
|
'p.value' => 2.95358918321176e-07, |
|
1077
|
|
|
|
|
|
|
'parameter' => { 'df' => 2 }, |
|
1078
|
|
|
|
|
|
|
'statistic' => { 'X-squared' => 30.0701490957547 } |
|
1079
|
|
|
|
|
|
|
} |
|
1080
|
|
|
|
|
|
|
|
|
1081
|
|
|
|
|
|
|
=head3 1-Dimensional Array (Goodness of Fit) |
|
1082
|
|
|
|
|
|
|
|
|
1083
|
|
|
|
|
|
|
Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items. |
|
1084
|
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
my $data = [10, 20, 30]; |
|
1086
|
|
|
|
|
|
|
my $res = chisq_test($data); |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
B |
|
1089
|
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
{ |
|
1091
|
|
|
|
|
|
|
'data.name' => 'Perl ArrayRef', |
|
1092
|
|
|
|
|
|
|
'expected' => [ 20, 20, 20 ], |
|
1093
|
|
|
|
|
|
|
'method' => 'Chi-squared test for given probabilities', |
|
1094
|
|
|
|
|
|
|
'observed' => [ 10, 20, 30 ], |
|
1095
|
|
|
|
|
|
|
'p.value' => 0.00673794699908547, |
|
1096
|
|
|
|
|
|
|
'parameter' => { 'df' => 2 }, |
|
1097
|
|
|
|
|
|
|
'statistic' => { 'X-squared' => 10 } |
|
1098
|
|
|
|
|
|
|
} |
|
1099
|
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=head3 2-Dimensional Hash (Pearson's Chi-squared) |
|
1101
|
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings. |
|
1103
|
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
my $data = { |
|
1105
|
|
|
|
|
|
|
GroupA => { Success => 10, Failure => 15 }, |
|
1106
|
|
|
|
|
|
|
GroupB => { Success => 20, Failure => 5 } |
|
1107
|
|
|
|
|
|
|
}; |
|
1108
|
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
my $res = chisq_test($data); |
|
1110
|
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
B |
|
1112
|
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
{ |
|
1114
|
|
|
|
|
|
|
'data.name' => 'Perl HashRef', |
|
1115
|
|
|
|
|
|
|
'expected' => { |
|
1116
|
|
|
|
|
|
|
'GroupA' => { 'Failure' => 10, 'Success' => 15 }, |
|
1117
|
|
|
|
|
|
|
'GroupB' => { 'Failure' => 10, 'Success' => 15 } |
|
1118
|
|
|
|
|
|
|
}, |
|
1119
|
|
|
|
|
|
|
'method' => "Pearson's Chi-squared test with Yates' continuity correction", |
|
1120
|
|
|
|
|
|
|
'observed' => { |
|
1121
|
|
|
|
|
|
|
'GroupA' => { 'Failure' => 15, 'Success' => 10 }, |
|
1122
|
|
|
|
|
|
|
'GroupB' => { 'Failure' => 5, 'Success' => 20 } |
|
1123
|
|
|
|
|
|
|
}, |
|
1124
|
|
|
|
|
|
|
'p.value' => 0.00937475878430379, |
|
1125
|
|
|
|
|
|
|
'parameter' => { 'df' => 1 }, |
|
1126
|
|
|
|
|
|
|
'statistic' => { 'X-squared' => 6.75 } |
|
1127
|
|
|
|
|
|
|
} |
|
1128
|
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
=head3 One-Dimensional Hash (Goodness of Fit) |
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks. |
|
1132
|
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
my $data = { |
|
1134
|
|
|
|
|
|
|
Apples => 10, |
|
1135
|
|
|
|
|
|
|
Oranges => 20, |
|
1136
|
|
|
|
|
|
|
Bananas => 30 |
|
1137
|
|
|
|
|
|
|
}; |
|
1138
|
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
my $res = chisq_test($data); |
|
1140
|
|
|
|
|
|
|
|
|
1141
|
|
|
|
|
|
|
=head2 C |
|
1142
|
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
Apply a B to every pair of columns in a table and collect |
|
1144
|
|
|
|
|
|
|
the answers in a hash of hashes. |
|
1145
|
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
It's the workhorse behind things like correlation matrices: give it your data and |
|
1147
|
|
|
|
|
|
|
the name of a function that takes two columns (C, C, …) and you get |
|
1148
|
|
|
|
|
|
|
back every column compared against every other column. |
|
1149
|
|
|
|
|
|
|
|
|
1150
|
|
|
|
|
|
|
use Stats::LikeR; |
|
1151
|
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
my %data = ( |
|
1153
|
|
|
|
|
|
|
height => [ 170, 165, 180, 175 ], |
|
1154
|
|
|
|
|
|
|
weight => [ 70, 60, 85, 77 ], |
|
1155
|
|
|
|
|
|
|
age => [ 30, 41, 25, 38 ], |
|
1156
|
|
|
|
|
|
|
); |
|
1157
|
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
my $result = col2col(\%data, 'cor'); |
|
1159
|
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
# $result->{height}{weight} == correlation of height vs weight |
|
1161
|
|
|
|
|
|
|
# $result->{height}{age} == correlation of height vs age |
|
1162
|
|
|
|
|
|
|
# ...and so on for every pair |
|
1163
|
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
======================================================================== |
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
=head3 Arguments |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
col2col( $data, $command, $cols, %options ) |
|
1169
|
|
|
|
|
|
|
col2col( $data, $command, \%options ) # options in place of $cols |
|
1170
|
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
=begin html |
|
1174
|
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
|
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
=end html |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
|
|
1210
|
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
======================================================================== |
|
1212
|
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
=head3 Data shapes |
|
1214
|
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
C understands three layouts. In every case a B is the thing that |
|
1216
|
|
|
|
|
|
|
gets compared, and the result is keyed by column name. |
|
1217
|
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
B — keys are column names: |
|
1219
|
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] ); |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
B — First keys are row names, second keys are columns: |
|
1223
|
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
my %hoh = ( |
|
1225
|
|
|
|
|
|
|
row1 => { a => 1, b => 4 }, |
|
1226
|
|
|
|
|
|
|
row2 => { a => 2, b => 5 }, |
|
1227
|
|
|
|
|
|
|
); |
|
1228
|
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
B — each element is a row, inner keys are columns: |
|
1230
|
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } ); |
|
1232
|
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
All three produce the same result for the same underlying numbers. Missing or |
|
1234
|
|
|
|
|
|
|
C cells are handled by the C option (below). |
|
1235
|
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
======================================================================== |
|
1237
|
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
=head3 The command |
|
1239
|
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
The second argument is the function applied to each pair of columns. It is called |
|
1241
|
|
|
|
|
|
|
as: |
|
1242
|
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
$command->( $column_a, $column_b ) # two ARRAY refs |
|
1244
|
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
so inside a block the two columns arrive in C<@_>: |
|
1246
|
|
|
|
|
|
|
|
|
1247
|
|
|
|
|
|
|
my $result = col2col(\%data, sub { |
|
1248
|
|
|
|
|
|
|
my ($x, $y) = @_; # $x and $y are array refs |
|
1249
|
|
|
|
|
|
|
cor($x, $y); |
|
1250
|
|
|
|
|
|
|
}); |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
You can also pass a B. A bare name is looked up in |
|
1253
|
|
|
|
|
|
|
C, so these two are equivalent: |
|
1254
|
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
col2col(\%data, 'cor'); |
|
1256
|
|
|
|
|
|
|
col2col(\%data, sub { cor($_[0], $_[1]) }); |
|
1257
|
|
|
|
|
|
|
|
|
1258
|
|
|
|
|
|
|
======================================================================== |
|
1259
|
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
=head3 The result |
|
1261
|
|
|
|
|
|
|
|
|
1262
|
|
|
|
|
|
|
Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>. |
|
1263
|
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
for my $from (sort keys %$result) { |
|
1265
|
|
|
|
|
|
|
for my $to (sort keys %{ $result->{$from} }) { |
|
1266
|
|
|
|
|
|
|
printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to}; |
|
1267
|
|
|
|
|
|
|
} |
|
1268
|
|
|
|
|
|
|
} |
|
1269
|
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist. |
|
1271
|
|
|
|
|
|
|
|
|
1272
|
|
|
|
|
|
|
======================================================================== |
|
1273
|
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
=head3 Restricting columns (C<$cols>) |
|
1275
|
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
By default every column is used as the "from" side. The third argument narrows |
|
1277
|
|
|
|
|
|
|
that down — handy when you only care about one variable. |
|
1278
|
|
|
|
|
|
|
|
|
1279
|
|
|
|
|
|
|
# all columns vs all columns |
|
1280
|
|
|
|
|
|
|
my $all = col2col(\%data, 'cor'); |
|
1281
|
|
|
|
|
|
|
# just ONE column vs every other column |
|
1282
|
|
|
|
|
|
|
my $one = col2col(\%data, 'cor', 'height'); |
|
1283
|
|
|
|
|
|
|
my $cors = $one->{height}; # { weight => ..., age => ... } |
|
1284
|
|
|
|
|
|
|
# a FEW specific columns vs every other column |
|
1285
|
|
|
|
|
|
|
my $few = col2col(\%data, 'cor', ['height', 'weight']); |
|
1286
|
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
The "to" side is always every other column; C<$cols> only limits the outer keys. |
|
1288
|
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
======================================================================== |
|
1290
|
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
=head3 Options |
|
1292
|
|
|
|
|
|
|
|
|
1293
|
|
|
|
|
|
|
Options can be given two ways: |
|
1294
|
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols |
|
1296
|
|
|
|
|
|
|
col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
The hash-ref form is convenient when you have B column restriction — it saves |
|
1299
|
|
|
|
|
|
|
you from passing a placeholder. (A hash ref I C<$cols>, so you can't use |
|
1300
|
|
|
|
|
|
|
it to restrict columns at the same time; use the trailing form for that.) |
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
=head4 C — how undefined values are handled |
|
1303
|
|
|
|
|
|
|
|
|
1304
|
|
|
|
|
|
|
Real data has gaps. C decides what the function sees. |
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
|
|
1308
|
|
|
|
|
|
|
=begin html |
|
1309
|
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
| Value |
|
1314
|
|
|
|
|
|
|
| Behaviour |
|
1315
|
|
|
|
|
|
|
| Use for |
|
1316
|
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
| |
|
1318
|
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
| 'pairwise' (default) |
|
1321
|
|
|
|
|
|
|
| A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length. |
|
1322
|
|
|
|
|
|
|
| Paired stats like cor. |
|
1323
|
|
|
|
|
|
|
|
|
1324
|
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
| 'omit' |
|
1326
|
|
|
|
|
|
|
| Each column drops its own undefined values independently. The two columns may end up different lengths. |
|
1327
|
|
|
|
|
|
|
| Unpaired tests like t_test, kruskal_test, where a gap in one sample shouldn't discard a value in the other. |
|
1328
|
|
|
|
|
|
|
|
|
1329
|
|
|
|
|
|
|
|
|
1330
|
|
|
|
|
|
|
| 'keep' |
|
1331
|
|
|
|
|
|
|
| Every row is passed through, undef and all. |
|
1332
|
|
|
|
|
|
|
| When your function does its own missing-data handling. |
|
1333
|
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
| |
|
1336
|
|
|
|
|
|
|
|
|
1337
|
|
|
|
|
|
|
=end html |
|
1338
|
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
|
|
1341
|
|
|
|
|
|
|
# correlation: keep only complete pairs (the default) |
|
1342
|
|
|
|
|
|
|
col2col(\%data, 'cor'); |
|
1343
|
|
|
|
|
|
|
# two-sample test: each column keeps its own values |
|
1344
|
|
|
|
|
|
|
col2col(\%data, 't_test', undef, na => 'omit'); |
|
1345
|
|
|
|
|
|
|
col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder |
|
1346
|
|
|
|
|
|
|
|
|
1347
|
|
|
|
|
|
|
C / C remain as boolean aliases for backward compatibility: |
|
1348
|
|
|
|
|
|
|
C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C. |
|
1349
|
|
|
|
|
|
|
|
|
1350
|
|
|
|
|
|
|
=head4 C — keep going when a pair fails I<(default: true)> |
|
1351
|
|
|
|
|
|
|
|
|
1352
|
|
|
|
|
|
|
Some functions croak on degenerate input — for example C dies if a column has |
|
1353
|
|
|
|
|
|
|
zero variance. By default C B that croak per pair: instead of |
|
1354
|
|
|
|
|
|
|
aborting the whole run, it stores the B of the error message in that |
|
1355
|
|
|
|
|
|
|
cell, so the result tells you I pair failed and I. Every other cell is |
|
1356
|
|
|
|
|
|
|
computed normally. |
|
1357
|
|
|
|
|
|
|
|
|
1358
|
|
|
|
|
|
|
my $r = col2col(\%data, 'cor'); |
|
1359
|
|
|
|
|
|
|
# a good pair: $r->{a}{b} == 0.83 |
|
1360
|
|
|
|
|
|
|
# a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0' |
|
1361
|
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
To restore the old "die on the first error" behaviour, turn it off: |
|
1363
|
|
|
|
|
|
|
|
|
1364
|
|
|
|
|
|
|
col2col(\%data, 'cor', undef, 'skip.errors' => 0); |
|
1365
|
|
|
|
|
|
|
col2col(\%data, 'cor', { 'skip.errors' => 0 }); |
|
1366
|
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
Only errors from B are trapped. Mistakes in the call itself |
|
1368
|
|
|
|
|
|
|
(unknown column, bad data, unknown function name, unknown option) always die. |
|
1369
|
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
======================================================================== |
|
1371
|
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
=head3 Worked examples |
|
1373
|
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
B |
|
1375
|
|
|
|
|
|
|
|
|
1376
|
|
|
|
|
|
|
my $m = col2col(\%data, 'cor'); |
|
1377
|
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
B |
|
1379
|
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
my $col = 'Testosterone, total (nmol/L)'; |
|
1381
|
|
|
|
|
|
|
my $cors = col2col($hoa, 'cor', $col)->{$col}; |
|
1382
|
|
|
|
|
|
|
for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) { |
|
1383
|
|
|
|
|
|
|
next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message |
|
1384
|
|
|
|
|
|
|
printf "%-30s % .3f\n", $other, $cors->{$other}; |
|
1385
|
|
|
|
|
|
|
} |
|
1386
|
|
|
|
|
|
|
|
|
1387
|
|
|
|
|
|
|
B |
|
1388
|
|
|
|
|
|
|
|
|
1389
|
|
|
|
|
|
|
my $t = col2col($hoa, 't_test', undef, na => 'omit'); |
|
1390
|
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
B |
|
1392
|
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
my $m = col2col($hoa, 'cor'); |
|
1394
|
|
|
|
|
|
|
for my $from (sort keys %$m) { |
|
1395
|
|
|
|
|
|
|
for my $to (sort keys %{ $m->{$from} }) { |
|
1396
|
|
|
|
|
|
|
my $v = $m->{$from}{$to}; |
|
1397
|
|
|
|
|
|
|
warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error |
|
1398
|
|
|
|
|
|
|
} |
|
1399
|
|
|
|
|
|
|
} |
|
1400
|
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
======================================================================== |
|
1402
|
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
=head3 Gotchas |
|
1404
|
|
|
|
|
|
|
|
|
1405
|
|
|
|
|
|
|
=over |
|
1406
|
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
=item * B, C<($col_a, $col_b)> — not a column and |
|
1408
|
|
|
|
|
|
|
a name. Unpack with C. |
|
1409
|
|
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
=item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall |
|
1411
|
|
|
|
|
|
|
variance can be flat on just the rows it shares with one partner, so C may |
|
1412
|
|
|
|
|
|
|
still croak for that pair. With the default C, that shows up as a |
|
1413
|
|
|
|
|
|
|
message in the single offending cell rather than killing the run. |
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
=item * B<< C does not modify your data. >> It reads the table and returns a new |
|
1416
|
|
|
|
|
|
|
hash of hashes. |
|
1417
|
|
|
|
|
|
|
|
|
1418
|
|
|
|
|
|
|
=item * B — i.e. |
|
1419
|
|
|
|
|
|
|
C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0> |
|
1420
|
|
|
|
|
|
|
means column C is the degenerate one for that pair. |
|
1421
|
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
=back |
|
1423
|
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
=head2 cor |
|
1425
|
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
cor($array1, $array2, $method = 'pearson'), |
|
1427
|
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
that is, C is the default and will be used if C<$method> is not specified. |
|
1429
|
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
Just like R, C, C, and C are available |
|
1431
|
|
|
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically. |
|
1433
|
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
=head2 cor_test |
|
1435
|
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
my $result = cor_test( |
|
1437
|
|
|
|
|
|
|
'x' => $x, |
|
1438
|
|
|
|
|
|
|
'y' => $y, |
|
1439
|
|
|
|
|
|
|
alternative => 'two.sided', |
|
1440
|
|
|
|
|
|
|
method => 'pearson', |
|
1441
|
|
|
|
|
|
|
continuity => 1 |
|
1442
|
|
|
|
|
|
|
); |
|
1443
|
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
C safely handles C (or C) values seamlessly by computing over pairwise complete observations. |
|
1445
|
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
=head2 cov |
|
1447
|
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
cov($array1, $array2, 'pearson') |
|
1449
|
|
|
|
|
|
|
|
|
1450
|
|
|
|
|
|
|
or |
|
1451
|
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
cov($array1, $array2, 'spearman') |
|
1453
|
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
or |
|
1455
|
|
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
cov($array1, $array2, 'kendall') |
|
1457
|
|
|
|
|
|
|
|
|
1458
|
|
|
|
|
|
|
=head2 csort |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
Sort a table by a column or by a custom comparator. Works on both common Perl table shapes and can transpose between them on the way out. Stable, non-destructive. |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
|
|
|
|
|
|
=head3 Signature |
|
1463
|
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
my $sorted = csort($data, $by); |
|
1465
|
|
|
|
|
|
|
my $sorted = csort($data, $by, $output); |
|
1466
|
|
|
|
|
|
|
|
|
1467
|
|
|
|
|
|
|
=over |
|
1468
|
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
=item * B<< C<$data> >> — your table, in either shape: |
|
1470
|
|
|
|
|
|
|
=over |
|
1471
|
|
|
|
|
|
|
|
|
1472
|
|
|
|
|
|
|
=item * B — arrayref of hashrefs (a list of rows): C<< [ {id=E1, v=E10}, {id=E2, v=E20} ] >> |
|
1473
|
|
|
|
|
|
|
|
|
1474
|
|
|
|
|
|
|
=item * B — hashref of arrayrefs (parallel columns): C<< { id=E[1,2], v=E[10,20] } >> |
|
1475
|
|
|
|
|
|
|
|
|
1476
|
|
|
|
|
|
|
=back |
|
1477
|
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
=item * B<< C<$by> >> — I to sort: |
|
1479
|
|
|
|
|
|
|
=over |
|
1480
|
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
=item * a B (string), or |
|
1482
|
|
|
|
|
|
|
|
|
1483
|
|
|
|
|
|
|
=item * a B coderef using C<$a> / C<$b>, just like Perl's C |
|
1484
|
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
=back |
|
1486
|
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
=item * B<< C<$output> >> I<(optional)> — C<'aoh'> or C<'hoa'> (case-insensitive). Defaults to the input shape; C also means "same as input". |
|
1488
|
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
=back |
|
1490
|
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
Returns a B structure. The input is never modified. |
|
1492
|
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
=head3 What it does |
|
1494
|
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
=over |
|
1496
|
|
|
|
|
|
|
|
|
1497
|
|
|
|
|
|
|
=item * B — numeric if every defined value in that column looks like a number, otherwise string comparison. Missing / C values sort B (matching R's C). |
|
1498
|
|
|
|
|
|
|
|
|
1499
|
|
|
|
|
|
|
=item * B — C<$a> and C<$b> are set in the comparator's I package, so a named sub from another package still sees its own C<$a>/C<$b>. For AoH they're the row hashrefs; for HoA they're per-row hashref views synthesized from the columns. |
|
1500
|
|
|
|
|
|
|
|
|
1501
|
|
|
|
|
|
|
=item * B — equal keys keep their original order (merge sort, same as Perl C and R C). |
|
1502
|
|
|
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
=item * B — keep the input shape, or transpose: AoH→HoA builds the union of all row keys (ordered by first appearance, gaps filled with C); HoA→AoH emits one hashref per row. |
|
1504
|
|
|
|
|
|
|
|
|
1505
|
|
|
|
|
|
|
=back |
|
1506
|
|
|
|
|
|
|
|
|
1507
|
|
|
|
|
|
|
=head3 Examples |
|
1508
|
|
|
|
|
|
|
|
|
1509
|
|
|
|
|
|
|
# by column, ascending numeric, AoH in / AoH out |
|
1510
|
|
|
|
|
|
|
my $rows = csort($aoh, 'score'); |
|
1511
|
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
# custom comparator (descending), HoA in / HoA out |
|
1513
|
|
|
|
|
|
|
my $cols = csort($hoa, sub { $b->{score} <=> $a->{score} }); |
|
1514
|
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
# sort an AoH but hand it back as columns (HoA) |
|
1516
|
|
|
|
|
|
|
my $cols = csort($aoh, 'name', 'hoa'); |
|
1517
|
|
|
|
|
|
|
|
|
1518
|
|
|
|
|
|
|
=head3 Notes |
|
1519
|
|
|
|
|
|
|
|
|
1520
|
|
|
|
|
|
|
=over |
|
1521
|
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
=item * B AoH output reuses the original row hashrefs (re-ordered); HoA output permutes every column in lockstep. |
|
1523
|
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
=item * Empty and single-row tables are handled for all four in/out combinations. |
|
1525
|
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
=item * An invalid C<$output> value croaks. |
|
1527
|
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
=back |
|
1529
|
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
=head2 dnorm |
|
1531
|
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
gives the density of the normal distribution, with the specified mean and standard deviation. |
|
1533
|
|
|
|
|
|
|
|
|
1534
|
|
|
|
|
|
|
In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value. |
|
1535
|
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
returns a single scalar/number if a single value is given, otherwise returns an array reference. |
|
1537
|
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
Usage: |
|
1539
|
|
|
|
|
|
|
|
|
1540
|
|
|
|
|
|
|
dnorm(4) # assumes a mean of 0 and standard deviation of 1 |
|
1541
|
|
|
|
|
|
|
|
|
1542
|
|
|
|
|
|
|
but default mean, standard deviation, and log can be passed as parameters: |
|
1543
|
|
|
|
|
|
|
|
|
1544
|
|
|
|
|
|
|
$x = dnorm(0, mean => 0, sd => 2, 'log' => 0); |
|
1545
|
|
|
|
|
|
|
|
|
1546
|
|
|
|
|
|
|
=head2 filter |
|
1547
|
|
|
|
|
|
|
|
|
1548
|
|
|
|
|
|
|
Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified. |
|
1549
|
|
|
|
|
|
|
|
|
1550
|
|
|
|
|
|
|
my $df2 = filter($df, col('column.name') > 4); |
|
1551
|
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
C accepts a predicate in one of two forms: |
|
1553
|
|
|
|
|
|
|
|
|
1554
|
|
|
|
|
|
|
=over |
|
1555
|
|
|
|
|
|
|
|
|
1556
|
|
|
|
|
|
|
=item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and |
|
1557
|
|
|
|
|
|
|
|
|
1558
|
|
|
|
|
|
|
=item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>. |
|
1559
|
|
|
|
|
|
|
|
|
1560
|
|
|
|
|
|
|
=back |
|
1561
|
|
|
|
|
|
|
|
|
1562
|
|
|
|
|
|
|
Both C and C are exported by default. |
|
1563
|
|
|
|
|
|
|
|
|
1564
|
|
|
|
|
|
|
=head3 Arguments |
|
1565
|
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
|
|
1567
|
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
=begin html |
|
1569
|
|
|
|
|
|
|
|
|
1570
|
|
|
|
|
|
|
|
|
1591
|
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
=end html |
|
1593
|
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
|
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned. |
|
1597
|
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
=head3 The C form |
|
1599
|
|
|
|
|
|
|
|
|
1600
|
|
|
|
|
|
|
C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row. |
|
1601
|
|
|
|
|
|
|
|
|
1602
|
|
|
|
|
|
|
filter($df, col('age') >= 18); # keep rows where age >= 18 |
|
1603
|
|
|
|
|
|
|
filter($df, col('sex') eq 'f'); # keep rows where sex is 'f' |
|
1604
|
|
|
|
|
|
|
filter($df, 18 <= col('age')); # operands may be in either order |
|
1605
|
|
|
|
|
|
|
|
|
1606
|
|
|
|
|
|
|
=head3 Comparison operators |
|
1607
|
|
|
|
|
|
|
|
|
1608
|
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
|
|
1610
|
|
|
|
|
|
|
=begin html |
|
1611
|
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
=end html |
|
1635
|
|
|
|
|
|
|
|
|
1636
|
|
|
|
|
|
|
|
|
1637
|
|
|
|
|
|
|
|
|
1638
|
|
|
|
|
|
|
C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>. |
|
1639
|
|
|
|
|
|
|
|
|
1640
|
|
|
|
|
|
|
=head3 Combining predicates: C<&>, C<|>, C |
|
1641
|
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
Predicates compose with bitwise C<&> (and), C<|> (or), and C (not): |
|
1643
|
|
|
|
|
|
|
|
|
1644
|
|
|
|
|
|
|
filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and |
|
1645
|
|
|
|
|
|
|
filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or |
|
1646
|
|
|
|
|
|
|
filter($df, !(col('x') > 100)); # not |
|
1647
|
|
|
|
|
|
|
|
|
1648
|
|
|
|
|
|
|
Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability. |
|
1649
|
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
=head3 The code-reference form |
|
1651
|
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
For logic the operators can't express, pass a C. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row. |
|
1653
|
|
|
|
|
|
|
|
|
1654
|
|
|
|
|
|
|
filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' }); |
|
1655
|
|
|
|
|
|
|
filter($df, sub { $_->{name} =~ /^A/ }); |
|
1656
|
|
|
|
|
|
|
filter($df, sub { $_[0]{score} > $_[0]{threshold} }); |
|
1657
|
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape. |
|
1659
|
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
=head3 Examples |
|
1661
|
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
use Stats::LikeR; |
|
1663
|
|
|
|
|
|
|
my $df = read_table('patients.csv'); # array of hashes |
|
1664
|
|
|
|
|
|
|
# numeric threshold |
|
1665
|
|
|
|
|
|
|
my $adults = filter($df, col('Age') >= 18); |
|
1666
|
|
|
|
|
|
|
# combine conditions |
|
1667
|
|
|
|
|
|
|
my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f')); |
|
1668
|
|
|
|
|
|
|
# arbitrary logic with a coderef |
|
1669
|
|
|
|
|
|
|
my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 }); |
|
1670
|
|
|
|
|
|
|
# hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel |
|
1671
|
|
|
|
|
|
|
my $hoa = read_table('patients.csv', 'output.type' => 'hoa'); |
|
1672
|
|
|
|
|
|
|
my $sub = filter($hoa, col('Age') > 32); |
|
1673
|
|
|
|
|
|
|
# $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned |
|
1674
|
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
=head3 Behavior and notes |
|
1676
|
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
=over |
|
1678
|
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
=item * B C builds and returns a new frame; C<$df> is left untouched. |
|
1680
|
|
|
|
|
|
|
|
|
1681
|
|
|
|
|
|
|
=item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C. |
|
1682
|
|
|
|
|
|
|
|
|
1683
|
|
|
|
|
|
|
=item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays. |
|
1684
|
|
|
|
|
|
|
|
|
1685
|
|
|
|
|
|
|
=item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA). |
|
1686
|
|
|
|
|
|
|
|
|
1687
|
|
|
|
|
|
|
=item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error. |
|
1688
|
|
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
=item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS. |
|
1690
|
|
|
|
|
|
|
|
|
1691
|
|
|
|
|
|
|
=back |
|
1692
|
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
=head3 See also |
|
1694
|
|
|
|
|
|
|
|
|
1695
|
|
|
|
|
|
|
C (whose C option applies the same coderef convention while reading a file), C. |
|
1696
|
|
|
|
|
|
|
|
|
1697
|
|
|
|
|
|
|
=head2 fisher_test |
|
1698
|
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
=head3 array reference entry |
|
1700
|
|
|
|
|
|
|
|
|
1701
|
|
|
|
|
|
|
my $array_data = [ |
|
1702
|
|
|
|
|
|
|
[10, 2], |
|
1703
|
|
|
|
|
|
|
[3, 15] |
|
1704
|
|
|
|
|
|
|
]; |
|
1705
|
|
|
|
|
|
|
my $res1 = fisher_test($array_data); |
|
1706
|
|
|
|
|
|
|
|
|
1707
|
|
|
|
|
|
|
which returns a hash reference: |
|
1708
|
|
|
|
|
|
|
|
|
1709
|
|
|
|
|
|
|
{ |
|
1710
|
|
|
|
|
|
|
alternative "two.sided", |
|
1711
|
|
|
|
|
|
|
conf_int [ |
|
1712
|
|
|
|
|
|
|
[0] 2.75343836564204, |
|
1713
|
|
|
|
|
|
|
[1] 300.682787419401 |
|
1714
|
|
|
|
|
|
|
], |
|
1715
|
|
|
|
|
|
|
conf_level 0.95, |
|
1716
|
|
|
|
|
|
|
estimate { |
|
1717
|
|
|
|
|
|
|
"odds ratio" 21.3053312750168 |
|
1718
|
|
|
|
|
|
|
}, |
|
1719
|
|
|
|
|
|
|
method "Fisher's Exact Test for Count Data", |
|
1720
|
|
|
|
|
|
|
p_value 0.000536724119143435 |
|
1721
|
|
|
|
|
|
|
} |
|
1722
|
|
|
|
|
|
|
|
|
1723
|
|
|
|
|
|
|
=head3 hash reference entry |
|
1724
|
|
|
|
|
|
|
|
|
1725
|
|
|
|
|
|
|
$ft = fisher_test( { |
|
1726
|
|
|
|
|
|
|
Guess => { |
|
1727
|
|
|
|
|
|
|
Milk => 3, Tea => 1 |
|
1728
|
|
|
|
|
|
|
}, |
|
1729
|
|
|
|
|
|
|
Truth => { |
|
1730
|
|
|
|
|
|
|
Milk => 1, Tea => 3 |
|
1731
|
|
|
|
|
|
|
} |
|
1732
|
|
|
|
|
|
|
}); |
|
1733
|
|
|
|
|
|
|
|
|
1734
|
|
|
|
|
|
|
=head2 glm |
|
1735
|
|
|
|
|
|
|
|
|
1736
|
|
|
|
|
|
|
takes a hash of an array as input |
|
1737
|
|
|
|
|
|
|
|
|
1738
|
|
|
|
|
|
|
my %tooth_growth = ( |
|
1739
|
|
|
|
|
|
|
dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 |
|
1740
|
|
|
|
|
|
|
1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 |
|
1741
|
|
|
|
|
|
|
0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 |
|
1742
|
|
|
|
|
|
|
2.0 2.0 2.0)], |
|
1743
|
|
|
|
|
|
|
len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5 |
|
1744
|
|
|
|
|
|
|
17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5 |
|
1745
|
|
|
|
|
|
|
15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0 |
|
1746
|
|
|
|
|
|
|
25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)], |
|
1747
|
|
|
|
|
|
|
supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC |
|
1748
|
|
|
|
|
|
|
VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ |
|
1749
|
|
|
|
|
|
|
OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)] |
|
1750
|
|
|
|
|
|
|
); |
|
1751
|
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
my $glm_teeth = glm( |
|
1753
|
|
|
|
|
|
|
data => \%tooth_growth, |
|
1754
|
|
|
|
|
|
|
formula => 'len ~ dose + supp', |
|
1755
|
|
|
|
|
|
|
family => 'gaussian' |
|
1756
|
|
|
|
|
|
|
); |
|
1757
|
|
|
|
|
|
|
|
|
1758
|
|
|
|
|
|
|
In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS): |
|
1759
|
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial'); |
|
1761
|
|
|
|
|
|
|
|
|
1762
|
|
|
|
|
|
|
=head3 Input Parameters |
|
1763
|
|
|
|
|
|
|
|
|
1764
|
|
|
|
|
|
|
|
|
1765
|
|
|
|
|
|
|
|
|
1766
|
|
|
|
|
|
|
=begin html |
|
1767
|
|
|
|
|
|
|
|
|
1768
|
|
|
|
|
|
|
|
1769
|
|
|
|
|
|
|
|
1770
|
|
|
|
|
|
|
|
|
1771
|
|
|
|
|
|
|
| Parameter |
|
1772
|
|
|
|
|
|
|
| Type |
|
1773
|
|
|
|
|
|
|
| Default |
|
1774
|
|
|
|
|
|
|
| Description |
|
1775
|
|
|
|
|
|
|
| Example |
|
1776
|
|
|
|
|
|
|
|
|
1777
|
|
|
|
|
|
|
| |
|
1778
|
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
|
|
1780
|
|
|
|
|
|
|
| formula |
|
1781
|
|
|
|
|
|
|
| String |
|
1782
|
|
|
|
|
|
|
| None (Required) |
|
1783
|
|
|
|
|
|
|
| A symbolic description of the model to be fitted. Supports operators like +, :, *, ^, and -1 (to remove the intercept). |
|
1784
|
|
|
|
|
|
|
| 'am ~ wt + hp', 'y ~ x - 1' |
|
1785
|
|
|
|
|
|
|
|
|
1786
|
|
|
|
|
|
|
|
|
1787
|
|
|
|
|
|
|
| data |
|
1788
|
|
|
|
|
|
|
| HashRef or ArrayRef |
|
1789
|
|
|
|
|
|
|
| None (Required) |
|
1790
|
|
|
|
|
|
|
| The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH). |
|
1791
|
|
|
|
|
|
|
| \%mtcars, [{x => 1, y => 2}, ...] |
|
1792
|
|
|
|
|
|
|
|
|
1793
|
|
|
|
|
|
|
|
|
1794
|
|
|
|
|
|
|
| family |
|
1795
|
|
|
|
|
|
|
| String |
|
1796
|
|
|
|
|
|
|
| 'gaussian' |
|
1797
|
|
|
|
|
|
|
| A description of the error distribution and link function to be used in the model. Currently supports 'gaussian' (identity link) and 'binomial' (logit link). |
|
1798
|
|
|
|
|
|
|
| 'binomial' |
|
1799
|
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
|
|
1801
|
|
|
|
|
|
|
| |
|
1802
|
|
|
|
|
|
|
|
|
1803
|
|
|
|
|
|
|
=end html |
|
1804
|
|
|
|
|
|
|
|
|
1805
|
|
|
|
|
|
|
|
|
1806
|
|
|
|
|
|
|
|
|
1807
|
|
|
|
|
|
|
=head3 Output variables |
|
1808
|
|
|
|
|
|
|
|
|
1809
|
|
|
|
|
|
|
|
|
1810
|
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
=begin html |
|
1812
|
|
|
|
|
|
|
|
|
1813
|
|
|
|
|
|
|
|
1814
|
|
|
|
|
|
|
|
1815
|
|
|
|
|
|
|
|
|
1816
|
|
|
|
|
|
|
| Variable |
|
1817
|
|
|
|
|
|
|
| Type |
|
1818
|
|
|
|
|
|
|
| Description |
|
1819
|
|
|
|
|
|
|
| Example |
|
1820
|
|
|
|
|
|
|
|
|
1821
|
|
|
|
|
|
|
| |
|
1822
|
|
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
|
|
|
1824
|
|
|
|
|
|
|
| aic |
|
1825
|
|
|
|
|
|
|
| Double |
|
1826
|
|
|
|
|
|
|
| Akaike's Information Criterion for the fitted model. |
|
1827
|
|
|
|
|
|
|
| 123.45 |
|
1828
|
|
|
|
|
|
|
|
|
1829
|
|
|
|
|
|
|
|
|
1830
|
|
|
|
|
|
|
| boundary |
|
1831
|
|
|
|
|
|
|
| Integer (Boolean) |
|
1832
|
|
|
|
|
|
|
| 1 if the fitted values computationally reached the 0 or 1 boundary (specific to the binomial family), 0 otherwise. |
|
1833
|
|
|
|
|
|
|
| 0 |
|
1834
|
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
|
|
1836
|
|
|
|
|
|
|
| coefficients |
|
1837
|
|
|
|
|
|
|
| HashRef |
|
1838
|
|
|
|
|
|
|
| A hash mapping the expanded model term names to their estimated coefficient values. |
|
1839
|
|
|
|
|
|
|
| {'Intercept' => 1.5, 'wt' => -0.5} |
|
1840
|
|
|
|
|
|
|
|
|
1841
|
|
|
|
|
|
|
|
|
1842
|
|
|
|
|
|
|
| converged |
|
1843
|
|
|
|
|
|
|
| Integer (Boolean) |
|
1844
|
|
|
|
|
|
|
| 1 if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, 0 otherwise. |
|
1845
|
|
|
|
|
|
|
| 1 |
|
1846
|
|
|
|
|
|
|
|
|
1847
|
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
| deviance |
|
1849
|
|
|
|
|
|
|
| Double |
|
1850
|
|
|
|
|
|
|
| The residual deviance of the fitted model. |
|
1851
|
|
|
|
|
|
|
| 15.2 |
|
1852
|
|
|
|
|
|
|
|
|
1853
|
|
|
|
|
|
|
|
|
1854
|
|
|
|
|
|
|
| deviance.resid |
|
1855
|
|
|
|
|
|
|
| HashRef |
|
1856
|
|
|
|
|
|
|
| A hash mapping data row names to their computed deviance residuals. |
|
1857
|
|
|
|
|
|
|
| {'Mazda RX4' => 0.12} |
|
1858
|
|
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
|
|
|
1860
|
|
|
|
|
|
|
| df.null |
|
1861
|
|
|
|
|
|
|
| Integer |
|
1862
|
|
|
|
|
|
|
| The residual degrees of freedom for the null model. |
|
1863
|
|
|
|
|
|
|
| 31 |
|
1864
|
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
|
|
1866
|
|
|
|
|
|
|
| df.residual |
|
1867
|
|
|
|
|
|
|
| Integer |
|
1868
|
|
|
|
|
|
|
| The residual degrees of freedom for the fitted model. |
|
1869
|
|
|
|
|
|
|
| 30 |
|
1870
|
|
|
|
|
|
|
|
|
1871
|
|
|
|
|
|
|
|
|
1872
|
|
|
|
|
|
|
| family |
|
1873
|
|
|
|
|
|
|
| String |
|
1874
|
|
|
|
|
|
|
| The statistical family used to fit the model. |
|
1875
|
|
|
|
|
|
|
| "gaussian" |
|
1876
|
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
|
|
1878
|
|
|
|
|
|
|
| fitted.values |
|
1879
|
|
|
|
|
|
|
| HashRef |
|
1880
|
|
|
|
|
|
|
| A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response). |
|
1881
|
|
|
|
|
|
|
| {'Mazda RX4' => 0.85} |
|
1882
|
|
|
|
|
|
|
|
|
1883
|
|
|
|
|
|
|
|
|
1884
|
|
|
|
|
|
|
| iter |
|
1885
|
|
|
|
|
|
|
| Integer |
|
1886
|
|
|
|
|
|
|
| The number of IRLS iterations performed before convergence or hitting the iteration limit. |
|
1887
|
|
|
|
|
|
|
| 4 |
|
1888
|
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
| null.deviance |
|
1891
|
|
|
|
|
|
|
| Double |
|
1892
|
|
|
|
|
|
|
| The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed). |
|
1893
|
|
|
|
|
|
|
| 43.5 |
|
1894
|
|
|
|
|
|
|
|
|
1895
|
|
|
|
|
|
|
|
|
1896
|
|
|
|
|
|
|
| rank |
|
1897
|
|
|
|
|
|
|
| Integer |
|
1898
|
|
|
|
|
|
|
| The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters). |
|
1899
|
|
|
|
|
|
|
| 2 |
|
1900
|
|
|
|
|
|
|
|
|
1901
|
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
| summary |
|
1903
|
|
|
|
|
|
|
| HashRef |
|
1904
|
|
|
|
|
|
|
| A nested hash mapping each term to its detailed summary statistics, including Estimate, Std. Error, t value / z value, and Pr(> t ) / Pr(> z ). Aliased parameters return "NaN". |
|
1905
|
|
|
|
|
|
|
| {'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}} |
|
1906
|
|
|
|
|
|
|
|
|
1907
|
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
| terms |
|
1909
|
|
|
|
|
|
|
| ArrayRef |
|
1910
|
|
|
|
|
|
|
| An ordered list of the expanded term names included in the model matrix. |
|
1911
|
|
|
|
|
|
|
| ['Intercept', 'wt', 'hp'] |
|
1912
|
|
|
|
|
|
|
|
|
1913
|
|
|
|
|
|
|
|
|
1914
|
|
|
|
|
|
|
| |
|
1915
|
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
=end html |
|
1917
|
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
|
|
1919
|
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
=head2 group_by |
|
1921
|
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column. |
|
1923
|
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
my $aoh_data = [ |
|
1925
|
|
|
|
|
|
|
{ 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 }, |
|
1926
|
|
|
|
|
|
|
{ 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 }, |
|
1927
|
|
|
|
|
|
|
{ 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 }, |
|
1928
|
|
|
|
|
|
|
{ 'Gender' => 'Female' } # Intentional missing target value |
|
1929
|
|
|
|
|
|
|
]; |
|
1930
|
|
|
|
|
|
|
|
|
1931
|
|
|
|
|
|
|
as well as |
|
1932
|
|
|
|
|
|
|
|
|
1933
|
|
|
|
|
|
|
$hoh_data = { |
|
1934
|
|
|
|
|
|
|
'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 }, |
|
1935
|
|
|
|
|
|
|
'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 }, |
|
1936
|
|
|
|
|
|
|
'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 }, |
|
1937
|
|
|
|
|
|
|
'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value |
|
1938
|
|
|
|
|
|
|
'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef |
|
1939
|
|
|
|
|
|
|
}; |
|
1940
|
|
|
|
|
|
|
|
|
1941
|
|
|
|
|
|
|
and |
|
1942
|
|
|
|
|
|
|
|
|
1943
|
|
|
|
|
|
|
my $hoa_data = { |
|
1944
|
|
|
|
|
|
|
'Gender' => ['Male', 'Female', 'Male', 'Female'], |
|
1945
|
|
|
|
|
|
|
'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ] |
|
1946
|
|
|
|
|
|
|
}; |
|
1947
|
|
|
|
|
|
|
|
|
1948
|
|
|
|
|
|
|
then run the function thus: |
|
1949
|
|
|
|
|
|
|
|
|
1950
|
|
|
|
|
|
|
group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender'); |
|
1951
|
|
|
|
|
|
|
|
|
1952
|
|
|
|
|
|
|
The output can be thought of like a hash, with the first string broken down by the second. |
|
1953
|
|
|
|
|
|
|
|
|
1954
|
|
|
|
|
|
|
all become hash of arrays: |
|
1955
|
|
|
|
|
|
|
|
|
1956
|
|
|
|
|
|
|
{ |
|
1957
|
|
|
|
|
|
|
Female [ |
|
1958
|
|
|
|
|
|
|
[0] 1.8 |
|
1959
|
|
|
|
|
|
|
], |
|
1960
|
|
|
|
|
|
|
Male [ |
|
1961
|
|
|
|
|
|
|
[0] 18.2, |
|
1962
|
|
|
|
|
|
|
[1] 20.5 |
|
1963
|
|
|
|
|
|
|
] |
|
1964
|
|
|
|
|
|
|
} |
|
1965
|
|
|
|
|
|
|
|
|
1966
|
|
|
|
|
|
|
returns an empty array of hashes if neither target nor group keys are found. |
|
1967
|
|
|
|
|
|
|
|
|
1968
|
|
|
|
|
|
|
=head3 Filtering |
|
1969
|
|
|
|
|
|
|
|
|
1970
|
|
|
|
|
|
|
Data can be further broken down with filter/subs like in C: |
|
1971
|
|
|
|
|
|
|
|
|
1972
|
|
|
|
|
|
|
my $testosterone = group_by($d, # group testosterone by "Gender" |
|
1973
|
|
|
|
|
|
|
'Testosterone, total (nmol/L)', |
|
1974
|
|
|
|
|
|
|
'Gender', |
|
1975
|
|
|
|
|
|
|
{ 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter |
|
1976
|
|
|
|
|
|
|
{ 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter |
|
1977
|
|
|
|
|
|
|
); |
|
1978
|
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
where each filter filters on the columns, e.g. second hash keys. |
|
1980
|
|
|
|
|
|
|
|
|
1981
|
|
|
|
|
|
|
=head2 hoh2hoa |
|
1982
|
|
|
|
|
|
|
|
|
1983
|
|
|
|
|
|
|
Convert a B (row-major: outer key = row, inner key = column) |
|
1984
|
|
|
|
|
|
|
into a B (column-major: key = column, value = that column's |
|
1985
|
|
|
|
|
|
|
cells down the rows). |
|
1986
|
|
|
|
|
|
|
|
|
1987
|
|
|
|
|
|
|
use Stats::LikeR; |
|
1988
|
|
|
|
|
|
|
|
|
1989
|
|
|
|
|
|
|
my %hoh = ( |
|
1990
|
|
|
|
|
|
|
'r1' => { 'a' => 1, 'b' => 2 }, |
|
1991
|
|
|
|
|
|
|
'r2' => { 'a' => 3, 'b' => 4 }, |
|
1992
|
|
|
|
|
|
|
); |
|
1993
|
|
|
|
|
|
|
|
|
1994
|
|
|
|
|
|
|
my $hoa = hoh2hoa(\%hoh); |
|
1995
|
|
|
|
|
|
|
|
|
1996
|
|
|
|
|
|
|
which returns |
|
1997
|
|
|
|
|
|
|
{ |
|
1998
|
|
|
|
|
|
|
a => [1, 3], |
|
1999
|
|
|
|
|
|
|
b => [2, 4], |
|
2000
|
|
|
|
|
|
|
} |
|
2001
|
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
=head3 Behavior |
|
2003
|
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
=over |
|
2005
|
|
|
|
|
|
|
|
|
2006
|
|
|
|
|
|
|
=item * B are the union of every inner key, so a key that appears in only |
|
2007
|
|
|
|
|
|
|
some rows still becomes a column. |
|
2008
|
|
|
|
|
|
|
|
|
2009
|
|
|
|
|
|
|
=item * B are emitted in sorted outer-key (row-name) order, and that one order |
|
2010
|
|
|
|
|
|
|
is used for every column, so the arrays stay aligned and the result is |
|
2011
|
|
|
|
|
|
|
reproducible regardless of hash ordering. |
|
2012
|
|
|
|
|
|
|
|
|
2013
|
|
|
|
|
|
|
=item * B — a missing inner key, or a cell whose value is C — are filled |
|
2014
|
|
|
|
|
|
|
with the fill value (see C below). Every column therefore has |
|
2015
|
|
|
|
|
|
|
exactly one entry per row. |
|
2016
|
|
|
|
|
|
|
|
|
2017
|
|
|
|
|
|
|
=item * Values are B into the result; the original structure is left |
|
2018
|
|
|
|
|
|
|
untouched. |
|
2019
|
|
|
|
|
|
|
|
|
2020
|
|
|
|
|
|
|
=item * An B hash of hashes returns an empty hash of arrays (it is not an |
|
2021
|
|
|
|
|
|
|
error). |
|
2022
|
|
|
|
|
|
|
|
|
2023
|
|
|
|
|
|
|
=back |
|
2024
|
|
|
|
|
|
|
|
|
2025
|
|
|
|
|
|
|
=head3 Options |
|
2026
|
|
|
|
|
|
|
|
|
2027
|
|
|
|
|
|
|
Options are passed as trailing C<< name =E value >> pairs. |
|
2028
|
|
|
|
|
|
|
|
|
2029
|
|
|
|
|
|
|
|
|
2030
|
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
=begin html |
|
2032
|
|
|
|
|
|
|
|
|
2033
|
|
|
|
|
|
|
|
2034
|
|
|
|
|
|
|
|
2035
|
|
|
|
|
|
|
|
|
2036
|
|
|
|
|
|
|
| Option |
|
2037
|
|
|
|
|
|
|
| Default |
|
2038
|
|
|
|
|
|
|
| Meaning |
|
2039
|
|
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
|
| |
|
2041
|
|
|
|
|
|
|
|
|
2042
|
|
|
|
|
|
|
|
|
2043
|
|
|
|
|
|
|
| undef.val |
|
2044
|
|
|
|
|
|
|
| undef |
|
2045
|
|
|
|
|
|
|
| Value used to fill a missing key or an undef cell. Any defined scalar works, including 0 and ''. Passing undef keeps the default. |
|
2046
|
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
|
|
2048
|
|
|
|
|
|
|
| row.names |
|
2049
|
|
|
|
|
|
|
| (none) |
|
2050
|
|
|
|
|
|
|
| If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column. |
|
2051
|
|
|
|
|
|
|
|
|
2052
|
|
|
|
|
|
|
|
|
2053
|
|
|
|
|
|
|
| |
|
2054
|
|
|
|
|
|
|
|
|
2055
|
|
|
|
|
|
|
=end html |
|
2056
|
|
|
|
|
|
|
|
|
2057
|
|
|
|
|
|
|
|
|
2058
|
|
|
|
|
|
|
|
|
2059
|
|
|
|
|
|
|
# Ragged input with an explicit fill string: |
|
2060
|
|
|
|
|
|
|
my %ragged = ( |
|
2061
|
|
|
|
|
|
|
'r1' => { 'a' => 1, 'b' => 2 }, |
|
2062
|
|
|
|
|
|
|
'r2' => { 'a' => 3, 'c' => 9 }, |
|
2063
|
|
|
|
|
|
|
); |
|
2064
|
|
|
|
|
|
|
my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA'); |
|
2065
|
|
|
|
|
|
|
# { |
|
2066
|
|
|
|
|
|
|
# a => [1, 3 ], |
|
2067
|
|
|
|
|
|
|
# b => [2, 'NA'], |
|
2068
|
|
|
|
|
|
|
# c => ['NA', 9 ], |
|
2069
|
|
|
|
|
|
|
# } |
|
2070
|
|
|
|
|
|
|
|
|
2071
|
|
|
|
|
|
|
# Keep the row labels as a column: |
|
2072
|
|
|
|
|
|
|
my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id'); |
|
2073
|
|
|
|
|
|
|
# { |
|
2074
|
|
|
|
|
|
|
# id => ['r1', 'r2'], |
|
2075
|
|
|
|
|
|
|
# a => [1, 3 ], |
|
2076
|
|
|
|
|
|
|
# b => [2, undef], |
|
2077
|
|
|
|
|
|
|
# c => [undef, 9 ], |
|
2078
|
|
|
|
|
|
|
# } |
|
2079
|
|
|
|
|
|
|
|
|
2080
|
|
|
|
|
|
|
=head3 Errors |
|
2081
|
|
|
|
|
|
|
|
|
2082
|
|
|
|
|
|
|
C dies (via C) when: |
|
2083
|
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
=over |
|
2085
|
|
|
|
|
|
|
|
|
2086
|
|
|
|
|
|
|
=item * the argument is not a hash reference, |
|
2087
|
|
|
|
|
|
|
|
|
2088
|
|
|
|
|
|
|
=item * any value in the hash is not itself a hash reference, |
|
2089
|
|
|
|
|
|
|
|
|
2090
|
|
|
|
|
|
|
=item * an unknown option is given, or the options are not C<< name =E value >> pairs, |
|
2091
|
|
|
|
|
|
|
|
|
2092
|
|
|
|
|
|
|
=item * C is not a plain string, or it names an already-present column. |
|
2093
|
|
|
|
|
|
|
|
|
2094
|
|
|
|
|
|
|
=back |
|
2095
|
|
|
|
|
|
|
|
|
2096
|
|
|
|
|
|
|
=head2 hist |
|
2097
|
|
|
|
|
|
|
|
|
2098
|
|
|
|
|
|
|
Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density. |
|
2099
|
|
|
|
|
|
|
|
|
2100
|
|
|
|
|
|
|
my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4); |
|
2101
|
|
|
|
|
|
|
|
|
2102
|
|
|
|
|
|
|
If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula. |
|
2103
|
|
|
|
|
|
|
|
|
2104
|
|
|
|
|
|
|
=head2 kruskal_test |
|
2105
|
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test) |
|
2107
|
|
|
|
|
|
|
|
|
2108
|
|
|
|
|
|
|
Performs a Kruskal-Wallis rank sum test, see |
|
2109
|
|
|
|
|
|
|
https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test |
|
2110
|
|
|
|
|
|
|
|
|
2111
|
|
|
|
|
|
|
=head3 hash of array entry |
|
2112
|
|
|
|
|
|
|
|
|
2113
|
|
|
|
|
|
|
I feel that this is better, and more easily read, than what you get in R: |
|
2114
|
|
|
|
|
|
|
|
|
2115
|
|
|
|
|
|
|
my %x = ( |
|
2116
|
|
|
|
|
|
|
'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2], |
|
2117
|
|
|
|
|
|
|
'obs. airway disease' => [3.8, 2.7, 4.0, 2.4], |
|
2118
|
|
|
|
|
|
|
'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0] |
|
2119
|
|
|
|
|
|
|
); |
|
2120
|
|
|
|
|
|
|
$kt = kruskal_test(\%x); |
|
2121
|
|
|
|
|
|
|
|
|
2122
|
|
|
|
|
|
|
=head3 R-like array entry |
|
2123
|
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects |
|
2125
|
|
|
|
|
|
|
my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease |
|
2126
|
|
|
|
|
|
|
my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis |
|
2127
|
|
|
|
|
|
|
my @x = (@xk, @yk, @zk); |
|
2128
|
|
|
|
|
|
|
my @g = ( |
|
2129
|
|
|
|
|
|
|
(map {'Normal subjects'} 0..4), |
|
2130
|
|
|
|
|
|
|
(map {'Subjects with obstructive airway disease'} 0..3), |
|
2131
|
|
|
|
|
|
|
map {'Subjects with asbestosis'} 0..4 |
|
2132
|
|
|
|
|
|
|
); |
|
2133
|
|
|
|
|
|
|
my $kt = kruskal_test(\@x, \@g); |
|
2134
|
|
|
|
|
|
|
|
|
2135
|
|
|
|
|
|
|
=head2 ks_test |
|
2136
|
|
|
|
|
|
|
|
|
2137
|
|
|
|
|
|
|
The Kolmogorov–Smirnov test checks whether two samples are drawn from the |
|
2138
|
|
|
|
|
|
|
same distribution (two-sample), or whether a single sample is drawn from a |
|
2139
|
|
|
|
|
|
|
given reference distribution (one-sample). It works by comparing the empirical |
|
2140
|
|
|
|
|
|
|
cumulative distribution functions (ECDFs) and measuring the largest gap |
|
2141
|
|
|
|
|
|
|
between them. |
|
2142
|
|
|
|
|
|
|
|
|
2143
|
|
|
|
|
|
|
Two-sample form — pass two array references: |
|
2144
|
|
|
|
|
|
|
|
|
2145
|
|
|
|
|
|
|
$ks = ks_test(\@x, \@y); |
|
2146
|
|
|
|
|
|
|
$ks = ks_test(\@x, \@y, alternative => 'greater'); |
|
2147
|
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
One-sample form — pass one array reference and the name of a reference CDF. |
|
2149
|
|
|
|
|
|
|
Currently only C<'pnorm'> is supported, i.e. the standard normal distribution |
|
2150
|
|
|
|
|
|
|
(mean 0, standard deviation 1): |
|
2151
|
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
$ks = ks_test(\@x, 'pnorm'); |
|
2153
|
|
|
|
|
|
|
|
|
2154
|
|
|
|
|
|
|
Arguments may be given positionally (as above) or by name: |
|
2155
|
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
$ks = ks_test(x => \@x, y => \@y, alternative => 'less', exact => 1); |
|
2157
|
|
|
|
|
|
|
|
|
2158
|
|
|
|
|
|
|
Non-numeric and undefined elements are silently dropped before the test runs. |
|
2159
|
|
|
|
|
|
|
|
|
2160
|
|
|
|
|
|
|
C selects which gap between the ECDFs is measured: |
|
2161
|
|
|
|
|
|
|
|
|
2162
|
|
|
|
|
|
|
=over |
|
2163
|
|
|
|
|
|
|
|
|
2164
|
|
|
|
|
|
|
=item * C<'two.sided'> (default) — the largest gap in either direction, |
|
2165
|
|
|
|
|
|
|
D = sup |F_x − F_y|. |
|
2166
|
|
|
|
|
|
|
|
|
2167
|
|
|
|
|
|
|
=item * C<'greater'> — the largest gap where x's ECDF rises above the other, |
|
2168
|
|
|
|
|
|
|
D⁺ = sup (F_x − F_y). |
|
2169
|
|
|
|
|
|
|
|
|
2170
|
|
|
|
|
|
|
=item * C<'less'> — the largest gap in the other direction, D⁻ = sup (F_y − F_x). |
|
2171
|
|
|
|
|
|
|
|
|
2172
|
|
|
|
|
|
|
=back |
|
2173
|
|
|
|
|
|
|
|
|
2174
|
|
|
|
|
|
|
These follow R's C convention: C<'greater'>/C<'less'> describe which CDF |
|
2175
|
|
|
|
|
|
|
lies I the other, which (because a higher CDF means smaller values) is |
|
2176
|
|
|
|
|
|
|
the opposite of which sample tends to be larger. |
|
2177
|
|
|
|
|
|
|
|
|
2178
|
|
|
|
|
|
|
C controls how the p-value is computed. Omit it to let the test choose: |
|
2179
|
|
|
|
|
|
|
the exact distribution is used for small samples (two-sample when nx·ny |
|
2180
|
|
|
|
|
|
|
10000, one-sample when n < 100) and the asymptotic (Kolmogorov limiting) |
|
2181
|
|
|
|
|
|
|
approximation otherwise. Pass C<< exact =E 1 >> to force the exact computation or |
|
2182
|
|
|
|
|
|
|
C<< exact =E 0 >> to force the asymptotic one. Exact p-values cannot be computed |
|
2183
|
|
|
|
|
|
|
when the data contain ties; if ties are present on the exact path, the test |
|
2184
|
|
|
|
|
|
|
warns and falls back to the asymptotic p-value. (The exact one-sample test is |
|
2185
|
|
|
|
|
|
|
only available for the two-sided alternative; a one-sided one-sample request |
|
2186
|
|
|
|
|
|
|
also falls back to asymptotic.) |
|
2187
|
|
|
|
|
|
|
|
|
2188
|
|
|
|
|
|
|
=head3 Return value |
|
2189
|
|
|
|
|
|
|
|
|
2190
|
|
|
|
|
|
|
C returns a hash reference with four keys: |
|
2191
|
|
|
|
|
|
|
|
|
2192
|
|
|
|
|
|
|
=over |
|
2193
|
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
=item * B<< C >> — the KS statistic for the chosen C: D, D⁺, or |
|
2195
|
|
|
|
|
|
|
D⁻. It is the maximum distance between the two ECDFs (or, for the one-sample |
|
2196
|
|
|
|
|
|
|
test, between the ECDF and the reference CDF), always in the range [0, 1]. |
|
2197
|
|
|
|
|
|
|
Larger values mean the distributions are further apart. |
|
2198
|
|
|
|
|
|
|
|
|
2199
|
|
|
|
|
|
|
=item * B<< C >> — the probability, under the null hypothesis that the samples |
|
2200
|
|
|
|
|
|
|
share a distribution, of observing a statistic at least this large. It is |
|
2201
|
|
|
|
|
|
|
clamped to [0, 1]; a small value (e.g. < 0.05) is evidence against the null. |
|
2202
|
|
|
|
|
|
|
|
|
2203
|
|
|
|
|
|
|
=item * B<< C >> — a human-readable description of exactly what was run, handy |
|
2204
|
|
|
|
|
|
|
for logging or reproducing a result. One of: |
|
2205
|
|
|
|
|
|
|
C<"Two-sample Kolmogorov-Smirnov exact test">, |
|
2206
|
|
|
|
|
|
|
C<"Two-sample Kolmogorov-Smirnov test (asymptotic)">, |
|
2207
|
|
|
|
|
|
|
C<"One-sample Kolmogorov-Smirnov exact test">, or |
|
2208
|
|
|
|
|
|
|
C<"One-sample Kolmogorov-Smirnov test (asymptotic)">. |
|
2209
|
|
|
|
|
|
|
|
|
2210
|
|
|
|
|
|
|
=item * B<< C >> — the alternative hypothesis that was applied |
|
2211
|
|
|
|
|
|
|
(C<'two.sided'>, C<'greater'>, or C<'less'>), echoed back so the result is |
|
2212
|
|
|
|
|
|
|
self-describing. |
|
2213
|
|
|
|
|
|
|
|
|
2214
|
|
|
|
|
|
|
=back |
|
2215
|
|
|
|
|
|
|
|
|
2216
|
|
|
|
|
|
|
For example: |
|
2217
|
|
|
|
|
|
|
|
|
2218
|
|
|
|
|
|
|
my $ks = ks_test(\@x, \@y); |
|
2219
|
|
|
|
|
|
|
if ($ks->{p_value} < 0.05) { |
|
2220
|
|
|
|
|
|
|
printf "reject H0: D=%.4f, p=%.4g (%s)\n", |
|
2221
|
|
|
|
|
|
|
$ks->{statistic}, $ks->{p_value}, $ks->{method}; |
|
2222
|
|
|
|
|
|
|
} |
|
2223
|
|
|
|
|
|
|
|
|
2224
|
|
|
|
|
|
|
=head2 ljoin |
|
2225
|
|
|
|
|
|
|
|
|
2226
|
|
|
|
|
|
|
Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>. |
|
2227
|
|
|
|
|
|
|
C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i> |
|
2228
|
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
For example, |
|
2230
|
|
|
|
|
|
|
|
|
2231
|
|
|
|
|
|
|
{ |
|
2232
|
|
|
|
|
|
|
"Jack Smith" { |
|
2233
|
|
|
|
|
|
|
age 30 |
|
2234
|
|
|
|
|
|
|
} |
|
2235
|
|
|
|
|
|
|
} |
|
2236
|
|
|
|
|
|
|
|
|
2237
|
|
|
|
|
|
|
and a second hash, |
|
2238
|
|
|
|
|
|
|
{ |
|
2239
|
|
|
|
|
|
|
"Jack Smith" { |
|
2240
|
|
|
|
|
|
|
dept "Engineering" |
|
2241
|
|
|
|
|
|
|
}, |
|
2242
|
|
|
|
|
|
|
"Jane Doe" { |
|
2243
|
|
|
|
|
|
|
age 25 |
|
2244
|
|
|
|
|
|
|
} |
|
2245
|
|
|
|
|
|
|
} |
|
2246
|
|
|
|
|
|
|
|
|
2247
|
|
|
|
|
|
|
in this case, running C will modify \%h to result: |
|
2248
|
|
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
|
{ |
|
2250
|
|
|
|
|
|
|
"Jack Smith" { |
|
2251
|
|
|
|
|
|
|
age 30, |
|
2252
|
|
|
|
|
|
|
dept "Engineering" |
|
2253
|
|
|
|
|
|
|
} |
|
2254
|
|
|
|
|
|
|
} |
|
2255
|
|
|
|
|
|
|
|
|
2256
|
|
|
|
|
|
|
=head2 lm |
|
2257
|
|
|
|
|
|
|
|
|
2258
|
|
|
|
|
|
|
This is the linear models function. |
|
2259
|
|
|
|
|
|
|
|
|
2260
|
|
|
|
|
|
|
$lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars); |
|
2261
|
|
|
|
|
|
|
|
|
2262
|
|
|
|
|
|
|
where C<$mtcars> is a hash of hashes |
|
2263
|
|
|
|
|
|
|
|
|
2264
|
|
|
|
|
|
|
C also supports generating interaction terms directly within the formula using the C<*> operator: |
|
2265
|
|
|
|
|
|
|
|
|
2266
|
|
|
|
|
|
|
my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars); |
|
2267
|
|
|
|
|
|
|
|
|
2268
|
|
|
|
|
|
|
If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting. |
|
2269
|
|
|
|
|
|
|
|
|
2270
|
|
|
|
|
|
|
the dot operator also works: |
|
2271
|
|
|
|
|
|
|
|
|
2272
|
|
|
|
|
|
|
$lm = lm(formula => 'y ~ .', data => $dot_data); |
|
2273
|
|
|
|
|
|
|
|
|
2274
|
|
|
|
|
|
|
=head2 matrix |
|
2275
|
|
|
|
|
|
|
|
|
2276
|
|
|
|
|
|
|
my $mat1 = matrix( |
|
2277
|
|
|
|
|
|
|
data => [1..6], |
|
2278
|
|
|
|
|
|
|
nrow => 2 |
|
2279
|
|
|
|
|
|
|
); |
|
2280
|
|
|
|
|
|
|
|
|
2281
|
|
|
|
|
|
|
You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise. |
|
2282
|
|
|
|
|
|
|
|
|
2283
|
|
|
|
|
|
|
As of version 0.10, parameters do not need to be named, so that C works more like R: |
|
2284
|
|
|
|
|
|
|
|
|
2285
|
|
|
|
|
|
|
my $d = matrix(rnorm(32000), 1000, 32); |
|
2286
|
|
|
|
|
|
|
|
|
2287
|
|
|
|
|
|
|
works as C, C, and C |
|
2288
|
|
|
|
|
|
|
|
|
2289
|
|
|
|
|
|
|
=head2 max |
|
2290
|
|
|
|
|
|
|
|
|
2291
|
|
|
|
|
|
|
max(1,2,3); |
|
2292
|
|
|
|
|
|
|
|
|
2293
|
|
|
|
|
|
|
or |
|
2294
|
|
|
|
|
|
|
|
|
2295
|
|
|
|
|
|
|
my @arr = 1..8; |
|
2296
|
|
|
|
|
|
|
max(@arr, 4, 5) |
|
2297
|
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
as of version 0.02, max will die if any undefined values are provided |
|
2299
|
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
=head2 mean |
|
2301
|
|
|
|
|
|
|
|
|
2302
|
|
|
|
|
|
|
mean(1,2,3); |
|
2303
|
|
|
|
|
|
|
|
|
2304
|
|
|
|
|
|
|
or |
|
2305
|
|
|
|
|
|
|
|
|
2306
|
|
|
|
|
|
|
my @arr = 1..8; |
|
2307
|
|
|
|
|
|
|
mean(@arr, 4, 5) |
|
2308
|
|
|
|
|
|
|
|
|
2309
|
|
|
|
|
|
|
or |
|
2310
|
|
|
|
|
|
|
|
|
2311
|
|
|
|
|
|
|
mean([1,1], [2,2]) # 1.5 |
|
2312
|
|
|
|
|
|
|
|
|
2313
|
|
|
|
|
|
|
as of version 0.02, mean will die if any undefined values are provided |
|
2314
|
|
|
|
|
|
|
|
|
2315
|
|
|
|
|
|
|
=head2 median |
|
2316
|
|
|
|
|
|
|
|
|
2317
|
|
|
|
|
|
|
works like mean, taking array references and arrays: |
|
2318
|
|
|
|
|
|
|
|
|
2319
|
|
|
|
|
|
|
median( $test_data[$i][0] ) |
|
2320
|
|
|
|
|
|
|
|
|
2321
|
|
|
|
|
|
|
as of version 0.02, median will die if any undefined values are provided |
|
2322
|
|
|
|
|
|
|
|
|
2323
|
|
|
|
|
|
|
=head2 min |
|
2324
|
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
min(1,2,3); |
|
2326
|
|
|
|
|
|
|
|
|
2327
|
|
|
|
|
|
|
or |
|
2328
|
|
|
|
|
|
|
|
|
2329
|
|
|
|
|
|
|
my @arr = 1..8; |
|
2330
|
|
|
|
|
|
|
min(@arr, 4, 5) |
|
2331
|
|
|
|
|
|
|
|
|
2332
|
|
|
|
|
|
|
as of version 0.02, min will die if any undefined values are provided |
|
2333
|
|
|
|
|
|
|
|
|
2334
|
|
|
|
|
|
|
=head2 mode |
|
2335
|
|
|
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings) |
|
2337
|
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
@arr = mode([1,3,3,3]); # returns (3) |
|
2339
|
|
|
|
|
|
|
|
|
2340
|
|
|
|
|
|
|
@arr = mode('a','a','c','c','z'); # returns ('a', 'c') |
|
2341
|
|
|
|
|
|
|
|
|
2342
|
|
|
|
|
|
|
=head2 oneway_test |
|
2343
|
|
|
|
|
|
|
|
|
2344
|
|
|
|
|
|
|
A one-way test for equality of group means that, unlike C/ANOVA, B
|
|
2345
|
|
|
|
|
|
|
assume equal variances>. By default it performs B (the |
|
2346
|
|
|
|
|
|
|
same default as R's C), so the residual degrees of freedom are |
|
2347
|
|
|
|
|
|
|
usually fractional. Pass C<< var_equal =E 1 >> for the classic equal-variance form. |
|
2348
|
|
|
|
|
|
|
|
|
2349
|
|
|
|
|
|
|
use Stats::LikeR qw(oneway_test); |
|
2350
|
|
|
|
|
|
|
|
|
2351
|
|
|
|
|
|
|
=head3 Input |
|
2352
|
|
|
|
|
|
|
|
|
2353
|
|
|
|
|
|
|
C accepts your data in one of three shapes. In every case each |
|
2354
|
|
|
|
|
|
|
I is a vector of at least two numeric observations. |
|
2355
|
|
|
|
|
|
|
|
|
2356
|
|
|
|
|
|
|
|
|
2357
|
|
|
|
|
|
|
|
|
2358
|
|
|
|
|
|
|
=begin html |
|
2359
|
|
|
|
|
|
|
|
|
2360
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
|
2362
|
|
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
|
| Shape |
|
2364
|
|
|
|
|
|
|
| What it means |
|
2365
|
|
|
|
|
|
|
| Group labels |
|
2366
|
|
|
|
|
|
|
|
|
2367
|
|
|
|
|
|
|
| |
|
2368
|
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
|
|
2370
|
|
|
|
|
|
|
| Hash of arrays { a => [...], b => [...] } |
|
2371
|
|
|
|
|
|
|
| Each key is a group (R's stack() view of a named list) |
|
2372
|
|
|
|
|
|
|
| the hash keys |
|
2373
|
|
|
|
|
|
|
|
|
2374
|
|
|
|
|
|
|
|
|
2375
|
|
|
|
|
|
|
| Array of arrays [ [...], [...] ] |
|
2376
|
|
|
|
|
|
|
| Each element is a group |
|
2377
|
|
|
|
|
|
|
| "Index 0", "Index 1", … |
|
2378
|
|
|
|
|
|
|
|
|
2379
|
|
|
|
|
|
|
|
|
2380
|
|
|
|
|
|
|
| Hash + formula { resp => [...], grp => [...] }, formula => 'resp ~ grp' |
|
2381
|
|
|
|
|
|
|
| Long-format columns split by a factor column |
|
2382
|
|
|
|
|
|
|
| the distinct values of the factor |
|
2383
|
|
|
|
|
|
|
|
|
2384
|
|
|
|
|
|
|
|
|
2385
|
|
|
|
|
|
|
| |
|
2386
|
|
|
|
|
|
|
|
|
2387
|
|
|
|
|
|
|
=end html |
|
2388
|
|
|
|
|
|
|
|
|
2389
|
|
|
|
|
|
|
|
|
2390
|
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
=head3 Options |
|
2392
|
|
|
|
|
|
|
|
|
2393
|
|
|
|
|
|
|
|
|
2394
|
|
|
|
|
|
|
|
|
2395
|
|
|
|
|
|
|
=begin html |
|
2396
|
|
|
|
|
|
|
|
|
2397
|
|
|
|
|
|
|
|
|
2418
|
|
|
|
|
|
|
|
|
2419
|
|
|
|
|
|
|
=end html |
|
2420
|
|
|
|
|
|
|
|
|
2421
|
|
|
|
|
|
|
|
|
2422
|
|
|
|
|
|
|
|
|
2423
|
|
|
|
|
|
|
=head3 Data validation |
|
2424
|
|
|
|
|
|
|
|
|
2425
|
|
|
|
|
|
|
Every observation must be B; an C or non-numeric |
|
2426
|
|
|
|
|
|
|
cell makes the call C with the offending group and position. This matches |
|
2427
|
|
|
|
|
|
|
the rest of C (C, C, C, … all die on C) and |
|
2428
|
|
|
|
|
|
|
prevents missing values from being silently treated as C<0>. |
|
2429
|
|
|
|
|
|
|
|
|
2430
|
|
|
|
|
|
|
Each group needs at least two observations, and you need at least two groups. |
|
2431
|
|
|
|
|
|
|
|
|
2432
|
|
|
|
|
|
|
=head3 Output |
|
2433
|
|
|
|
|
|
|
|
|
2434
|
|
|
|
|
|
|
A hash reference with three top-level keys: |
|
2435
|
|
|
|
|
|
|
|
|
2436
|
|
|
|
|
|
|
|
|
2437
|
|
|
|
|
|
|
|
|
2438
|
|
|
|
|
|
|
=begin html |
|
2439
|
|
|
|
|
|
|
|
|
2440
|
|
|
|
|
|
|
|
2441
|
|
|
|
|
|
|
|
2442
|
|
|
|
|
|
|
|
|
2443
|
|
|
|
|
|
|
| Key |
|
2444
|
|
|
|
|
|
|
| Value |
|
2445
|
|
|
|
|
|
|
|
|
2446
|
|
|
|
|
|
|
| |
|
2447
|
|
|
|
|
|
|
|
|
2448
|
|
|
|
|
|
|
|
|
2449
|
|
|
|
|
|
|
| factor name (Group, or the formula's factor, e.g. supp) |
|
2450
|
|
|
|
|
|
|
| the between-groups row: Df, Sum Sq, Mean Sq, F value, Pr(>F) |
|
2451
|
|
|
|
|
|
|
|
|
2452
|
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
| Residuals |
|
2454
|
|
|
|
|
|
|
| the within-groups row: Df, Sum Sq, Mean Sq (Df is fractional under Welch) |
|
2455
|
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
|
|
2457
|
|
|
|
|
|
|
| group_stats |
|
2458
|
|
|
|
|
|
|
| { mean => { group => mean, … }, size => { group => n, … } } |
|
2459
|
|
|
|
|
|
|
|
|
2460
|
|
|
|
|
|
|
|
|
2461
|
|
|
|
|
|
|
| |
|
2462
|
|
|
|
|
|
|
|
|
2463
|
|
|
|
|
|
|
=end html |
|
2464
|
|
|
|
|
|
|
|
|
2465
|
|
|
|
|
|
|
|
|
2466
|
|
|
|
|
|
|
|
|
2467
|
|
|
|
|
|
|
=head3 Examples |
|
2468
|
|
|
|
|
|
|
|
|
2469
|
|
|
|
|
|
|
=head4 Hash of arrays (each key is a group) |
|
2470
|
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
my $res = oneway_test({ |
|
2472
|
|
|
|
|
|
|
yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
2473
|
|
|
|
|
|
|
ctrl => [1, 1, 1, 0, 0, 0 ], |
|
2474
|
|
|
|
|
|
|
}); |
|
2475
|
|
|
|
|
|
|
|
|
2476
|
|
|
|
|
|
|
{ |
|
2477
|
|
|
|
|
|
|
Group => { |
|
2478
|
|
|
|
|
|
|
Df => 1, |
|
2479
|
|
|
|
|
|
|
"Sum Sq" => 61.6533333333333, |
|
2480
|
|
|
|
|
|
|
"Mean Sq" => 61.6533333333333, |
|
2481
|
|
|
|
|
|
|
"F value" => 177.504798464491, |
|
2482
|
|
|
|
|
|
|
"Pr(>F)" => 1.31343255160843e-07, |
|
2483
|
|
|
|
|
|
|
}, |
|
2484
|
|
|
|
|
|
|
Residuals => { |
|
2485
|
|
|
|
|
|
|
Df => 9.81767348326473, # fractional: Welch correction |
|
2486
|
|
|
|
|
|
|
"Sum Sq" => 3.47333333333333, |
|
2487
|
|
|
|
|
|
|
"Mean Sq" => 0.353783749200256, |
|
2488
|
|
|
|
|
|
|
}, |
|
2489
|
|
|
|
|
|
|
group_stats => { |
|
2490
|
|
|
|
|
|
|
mean => { ctrl => 0.5, yield => 5.03333333333333 }, |
|
2491
|
|
|
|
|
|
|
size => { ctrl => 6, yield => 6 }, |
|
2492
|
|
|
|
|
|
|
}, |
|
2493
|
|
|
|
|
|
|
} |
|
2494
|
|
|
|
|
|
|
|
|
2495
|
|
|
|
|
|
|
=head4 Array of arrays (groups named by index) |
|
2496
|
|
|
|
|
|
|
|
|
2497
|
|
|
|
|
|
|
my $res = oneway_test([ |
|
2498
|
|
|
|
|
|
|
[5.5, 5.4, 5.8, 4.5, 4.8, 4.2], |
|
2499
|
|
|
|
|
|
|
[1, 1, 1, 0, 0, 0 ], |
|
2500
|
|
|
|
|
|
|
]); |
|
2501
|
|
|
|
|
|
|
|
|
2502
|
|
|
|
|
|
|
Identical to the hash form, except C is keyed by position: |
|
2503
|
|
|
|
|
|
|
|
|
2504
|
|
|
|
|
|
|
group_stats => { |
|
2505
|
|
|
|
|
|
|
mean => { "Index 0" => 5.03333333333333, "Index 1" => 0.5 }, |
|
2506
|
|
|
|
|
|
|
size => { "Index 0" => 6, "Index 1" => 6 }, |
|
2507
|
|
|
|
|
|
|
} |
|
2508
|
|
|
|
|
|
|
|
|
2509
|
|
|
|
|
|
|
=head4 Long format with a formula |
|
2510
|
|
|
|
|
|
|
|
|
2511
|
|
|
|
|
|
|
When your data is in columns rather than pre-split groups, name the response |
|
2512
|
|
|
|
|
|
|
and factor columns with a formula. The factor's I become the groups and |
|
2513
|
|
|
|
|
|
|
the factor's I becomes the top-level key: |
|
2514
|
|
|
|
|
|
|
|
|
2515
|
|
|
|
|
|
|
my $res = oneway_test( |
|
2516
|
|
|
|
|
|
|
{ |
|
2517
|
|
|
|
|
|
|
len => [4.2, 11.5, 7.3, 16.5, 17.3, 13.6, 23.6, 18.5, 33.9], |
|
2518
|
|
|
|
|
|
|
supp => [qw(VC VC VC OJ OJ OJ HI HI HI)], |
|
2519
|
|
|
|
|
|
|
}, |
|
2520
|
|
|
|
|
|
|
formula => 'len ~ supp', |
|
2521
|
|
|
|
|
|
|
); |
|
2522
|
|
|
|
|
|
|
# $res->{supp}, $res->{Residuals}, $res->{group_stats} ... |
|
2523
|
|
|
|
|
|
|
|
|
2524
|
|
|
|
|
|
|
=head3 Classic equal-variance form |
|
2525
|
|
|
|
|
|
|
|
|
2526
|
|
|
|
|
|
|
my $res = oneway_test(\%groups, var_equal => 1); # or 'var.equal' => 1 |
|
2527
|
|
|
|
|
|
|
|
|
2528
|
|
|
|
|
|
|
=head3 Notes |
|
2529
|
|
|
|
|
|
|
|
|
2530
|
|
|
|
|
|
|
=over |
|
2531
|
|
|
|
|
|
|
|
|
2532
|
|
|
|
|
|
|
=item * The default (Welch) does B require equal group sizes or equal variances; |
|
2533
|
|
|
|
|
|
|
the pooled form (C<< var_equal =E 1 >>) assumes equal variances. |
|
2534
|
|
|
|
|
|
|
|
|
2535
|
|
|
|
|
|
|
=item * C is only meaningful for a hash input. Passing it with an array of |
|
2536
|
|
|
|
|
|
|
arrays is an error. |
|
2537
|
|
|
|
|
|
|
|
|
2538
|
|
|
|
|
|
|
=item * Group order in the output is not guaranteed for hash inputs (it follows hash |
|
2539
|
|
|
|
|
|
|
iteration order); read results by name, not position. |
|
2540
|
|
|
|
|
|
|
|
|
2541
|
|
|
|
|
|
|
=item * Avoid naming a factor C or C in a formula, since those |
|
2542
|
|
|
|
|
|
|
are reserved top-level keys in the result. |
|
2543
|
|
|
|
|
|
|
|
|
2544
|
|
|
|
|
|
|
=back |
|
2545
|
|
|
|
|
|
|
|
|
2546
|
|
|
|
|
|
|
=head2 p_adjust |
|
2547
|
|
|
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr" |
|
2549
|
|
|
|
|
|
|
|
|
2550
|
|
|
|
|
|
|
my @q = p_adjust(\@pvalues, $method); |
|
2551
|
|
|
|
|
|
|
|
|
2552
|
|
|
|
|
|
|
=head2 power_t_test |
|
2553
|
|
|
|
|
|
|
|
|
2554
|
|
|
|
|
|
|
$test_data = power_t_test( |
|
2555
|
|
|
|
|
|
|
n => 30, delta => 0.5, |
|
2556
|
|
|
|
|
|
|
sd => 1.0, sig_level => 0.05 |
|
2557
|
|
|
|
|
|
|
); |
|
2558
|
|
|
|
|
|
|
|
|
2559
|
|
|
|
|
|
|
It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution. |
|
2560
|
|
|
|
|
|
|
|
|
2561
|
|
|
|
|
|
|
|
|
2562
|
|
|
|
|
|
|
|
|
2563
|
|
|
|
|
|
|
=begin html |
|
2564
|
|
|
|
|
|
|
|
|
2565
|
|
|
|
|
|
|
|
|
2631
|
|
|
|
|
|
|
|
|
2632
|
|
|
|
|
|
|
=end html |
|
2633
|
|
|
|
|
|
|
|
|
2634
|
|
|
|
|
|
|
|
|
2635
|
|
|
|
|
|
|
|
|
2636
|
|
|
|
|
|
|
=head2 prcomp |
|
2637
|
|
|
|
|
|
|
|
|
2638
|
|
|
|
|
|
|
Principal Component Analysis |
|
2639
|
|
|
|
|
|
|
|
|
2640
|
|
|
|
|
|
|
=head3 Options |
|
2641
|
|
|
|
|
|
|
|
|
2642
|
|
|
|
|
|
|
|
|
2643
|
|
|
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
=begin html |
|
2645
|
|
|
|
|
|
|
|
|
2646
|
|
|
|
|
|
|
|
2647
|
|
|
|
|
|
|
|
2648
|
|
|
|
|
|
|
|
|
2649
|
|
|
|
|
|
|
| Option |
|
2650
|
|
|
|
|
|
|
| Type |
|
2651
|
|
|
|
|
|
|
| Default |
|
2652
|
|
|
|
|
|
|
| Description |
|
2653
|
|
|
|
|
|
|
|
|
2654
|
|
|
|
|
|
|
| |
|
2655
|
|
|
|
|
|
|
|
|
2656
|
|
|
|
|
|
|
|
|
2657
|
|
|
|
|
|
|
| center |
|
2658
|
|
|
|
|
|
|
| Boolean |
|
2659
|
|
|
|
|
|
|
| 1 (True) |
|
2660
|
|
|
|
|
|
|
| If true, the variables are shifted to be zero-centered before the analysis takes place. |
|
2661
|
|
|
|
|
|
|
|
|
2662
|
|
|
|
|
|
|
|
|
2663
|
|
|
|
|
|
|
| scale |
|
2664
|
|
|
|
|
|
|
| Boolean |
|
2665
|
|
|
|
|
|
|
| 0 (False) |
|
2666
|
|
|
|
|
|
|
| If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will croak to prevent division by zero. |
|
2667
|
|
|
|
|
|
|
|
|
2668
|
|
|
|
|
|
|
|
|
2669
|
|
|
|
|
|
|
| retx |
|
2670
|
|
|
|
|
|
|
| Boolean |
|
2671
|
|
|
|
|
|
|
| 1 (True) |
|
2672
|
|
|
|
|
|
|
| If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key x. |
|
2673
|
|
|
|
|
|
|
|
|
2674
|
|
|
|
|
|
|
|
|
2675
|
|
|
|
|
|
|
| tol |
|
2676
|
|
|
|
|
|
|
| Number |
|
2677
|
|
|
|
|
|
|
| undef |
|
2678
|
|
|
|
|
|
|
| A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to tol times the standard deviation of the first component. |
|
2679
|
|
|
|
|
|
|
|
|
2680
|
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
| rank |
|
2682
|
|
|
|
|
|
|
| Integer |
|
2683
|
|
|
|
|
|
|
| undef |
|
2684
|
|
|
|
|
|
|
| Optionally specify a strict limit on the number of principal components to return. The function will return min(rank, rows, columns) components. |
|
2685
|
|
|
|
|
|
|
|
|
2686
|
|
|
|
|
|
|
|
|
2687
|
|
|
|
|
|
|
| |
|
2688
|
|
|
|
|
|
|
|
|
2689
|
|
|
|
|
|
|
=end html |
|
2690
|
|
|
|
|
|
|
|
|
2691
|
|
|
|
|
|
|
|
|
2692
|
|
|
|
|
|
|
|
|
2693
|
|
|
|
|
|
|
=head3 Results |
|
2694
|
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
=head4 Returned Data Structure |
|
2696
|
|
|
|
|
|
|
|
|
2697
|
|
|
|
|
|
|
The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis: |
|
2698
|
|
|
|
|
|
|
|
|
2699
|
|
|
|
|
|
|
|
|
2700
|
|
|
|
|
|
|
|
|
2701
|
|
|
|
|
|
|
=begin html |
|
2702
|
|
|
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
|
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
|
|
2706
|
|
|
|
|
|
|
| Key |
|
2707
|
|
|
|
|
|
|
| Type |
|
2708
|
|
|
|
|
|
|
| Description |
|
2709
|
|
|
|
|
|
|
|
|
2710
|
|
|
|
|
|
|
| |
|
2711
|
|
|
|
|
|
|
|
|
2712
|
|
|
|
|
|
|
|
|
2713
|
|
|
|
|
|
|
| sdev |
|
2714
|
|
|
|
|
|
|
| ArrayRef[Number] |
|
2715
|
|
|
|
|
|
|
| The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix. |
|
2716
|
|
|
|
|
|
|
|
|
2717
|
|
|
|
|
|
|
|
|
2718
|
|
|
|
|
|
|
| rotation |
|
2719
|
|
|
|
|
|
|
| ArrayRef[ArrayRef] |
|
2720
|
|
|
|
|
|
|
| A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components. |
|
2721
|
|
|
|
|
|
|
|
|
2722
|
|
|
|
|
|
|
|
|
2723
|
|
|
|
|
|
|
| x |
|
2724
|
|
|
|
|
|
|
| ArrayRef[ArrayRef] |
|
2725
|
|
|
|
|
|
|
| A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the retx option is true. |
|
2726
|
|
|
|
|
|
|
|
|
2727
|
|
|
|
|
|
|
|
|
2728
|
|
|
|
|
|
|
| center |
|
2729
|
|
|
|
|
|
|
| ArrayRef[Number] or 0 |
|
2730
|
|
|
|
|
|
|
| The centering values used (typically the column means). Returns false (0) if centering was disabled. |
|
2731
|
|
|
|
|
|
|
|
|
2732
|
|
|
|
|
|
|
|
|
2733
|
|
|
|
|
|
|
| scale |
|
2734
|
|
|
|
|
|
|
| ArrayRef[Number] or 0 |
|
2735
|
|
|
|
|
|
|
| The scaling values used (typically the column standard deviations). Returns false (0) if scaling was disabled. |
|
2736
|
|
|
|
|
|
|
|
|
2737
|
|
|
|
|
|
|
|
|
2738
|
|
|
|
|
|
|
| varnames |
|
2739
|
|
|
|
|
|
|
| ArrayRef[String] |
|
2740
|
|
|
|
|
|
|
| The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH). |
|
2741
|
|
|
|
|
|
|
|
|
2742
|
|
|
|
|
|
|
|
|
2743
|
|
|
|
|
|
|
| |
|
2744
|
|
|
|
|
|
|
|
|
2745
|
|
|
|
|
|
|
=end html |
|
2746
|
|
|
|
|
|
|
|
|
2747
|
|
|
|
|
|
|
|
|
2748
|
|
|
|
|
|
|
|
|
2749
|
|
|
|
|
|
|
=head3 Using array of arrays |
|
2750
|
|
|
|
|
|
|
|
|
2751
|
|
|
|
|
|
|
my $aoa = [ |
|
2752
|
|
|
|
|
|
|
[2, 4], |
|
2753
|
|
|
|
|
|
|
[4, 2], |
|
2754
|
|
|
|
|
|
|
[6, 6] |
|
2755
|
|
|
|
|
|
|
]; |
|
2756
|
|
|
|
|
|
|
|
|
2757
|
|
|
|
|
|
|
my $pca = prcomp($aoa); |
|
2758
|
|
|
|
|
|
|
|
|
2759
|
|
|
|
|
|
|
which returns |
|
2760
|
|
|
|
|
|
|
|
|
2761
|
|
|
|
|
|
|
{ |
|
2762
|
|
|
|
|
|
|
center [ |
|
2763
|
|
|
|
|
|
|
[0] 4, |
|
2764
|
|
|
|
|
|
|
[1] 4 |
|
2765
|
|
|
|
|
|
|
], |
|
2766
|
|
|
|
|
|
|
rotation [ |
|
2767
|
|
|
|
|
|
|
[0] [ |
|
2768
|
|
|
|
|
|
|
[0] 0.707106781186547, |
|
2769
|
|
|
|
|
|
|
[1] 0.707106781186548 |
|
2770
|
|
|
|
|
|
|
], |
|
2771
|
|
|
|
|
|
|
[1] [ |
|
2772
|
|
|
|
|
|
|
[0] 0.707106781186548, |
|
2773
|
|
|
|
|
|
|
[1] -0.707106781186547 |
|
2774
|
|
|
|
|
|
|
] |
|
2775
|
|
|
|
|
|
|
], |
|
2776
|
|
|
|
|
|
|
scale 0, |
|
2777
|
|
|
|
|
|
|
sdev [ |
|
2778
|
|
|
|
|
|
|
[0] 2.44948974278318, |
|
2779
|
|
|
|
|
|
|
[1] 1.4142135623731 |
|
2780
|
|
|
|
|
|
|
], |
|
2781
|
|
|
|
|
|
|
x [ |
|
2782
|
|
|
|
|
|
|
[0] [ |
|
2783
|
|
|
|
|
|
|
[0] -1.41421356237309, |
|
2784
|
|
|
|
|
|
|
[1] -1.4142135623731 |
|
2785
|
|
|
|
|
|
|
], |
|
2786
|
|
|
|
|
|
|
[1] [ |
|
2787
|
|
|
|
|
|
|
[0] -1.4142135623731, |
|
2788
|
|
|
|
|
|
|
[1] 1.41421356237309 |
|
2789
|
|
|
|
|
|
|
], |
|
2790
|
|
|
|
|
|
|
[2] [ |
|
2791
|
|
|
|
|
|
|
[0] 2.82842712474619, |
|
2792
|
|
|
|
|
|
|
[1] 2.22044604925031e-16 |
|
2793
|
|
|
|
|
|
|
] |
|
2794
|
|
|
|
|
|
|
] |
|
2795
|
|
|
|
|
|
|
} |
|
2796
|
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
=head3 Hash of Arrays |
|
2798
|
|
|
|
|
|
|
|
|
2799
|
|
|
|
|
|
|
my $hoa = { B => [4, 2, 6], A => [2, 4, 6] }; |
|
2800
|
|
|
|
|
|
|
my $pca = prcomp($hoa); |
|
2801
|
|
|
|
|
|
|
|
|
2802
|
|
|
|
|
|
|
=head2 quantile |
|
2803
|
|
|
|
|
|
|
|
|
2804
|
|
|
|
|
|
|
Calculates sample quantiles using R's continuous Type 7 interpolation. |
|
2805
|
|
|
|
|
|
|
|
|
2806
|
|
|
|
|
|
|
my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]); |
|
2807
|
|
|
|
|
|
|
|
|
2808
|
|
|
|
|
|
|
If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">). |
|
2809
|
|
|
|
|
|
|
|
|
2810
|
|
|
|
|
|
|
=head2 rbinom |
|
2811
|
|
|
|
|
|
|
|
|
2812
|
|
|
|
|
|
|
Create a binomial distribution of numbers |
|
2813
|
|
|
|
|
|
|
|
|
2814
|
|
|
|
|
|
|
my $binom = rbinom( n => $n, prob => 0.5, size => 9); |
|
2815
|
|
|
|
|
|
|
|
|
2816
|
|
|
|
|
|
|
It hooks directly into Perl's internal PRNG system, respecting C seeds. |
|
2817
|
|
|
|
|
|
|
|
|
2818
|
|
|
|
|
|
|
=head2 read_table |
|
2819
|
|
|
|
|
|
|
|
|
2820
|
|
|
|
|
|
|
I've tried to make this as simple as possible, trying to follow from R: |
|
2821
|
|
|
|
|
|
|
|
|
2822
|
|
|
|
|
|
|
my $test_data = read_table('t/HepatitisCdata.csv'); |
|
2823
|
|
|
|
|
|
|
|
|
2824
|
|
|
|
|
|
|
=head3 options |
|
2825
|
|
|
|
|
|
|
|
|
2826
|
|
|
|
|
|
|
|
|
2827
|
|
|
|
|
|
|
|
|
2828
|
|
|
|
|
|
|
=begin html |
|
2829
|
|
|
|
|
|
|
|
|
2830
|
|
|
|
|
|
|
|
|
2871
|
|
|
|
|
|
|
|
|
2872
|
|
|
|
|
|
|
=end html |
|
2873
|
|
|
|
|
|
|
|
|
2874
|
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
|
|
2876
|
|
|
|
|
|
|
output types can be AOH (aoa), HOA (hoa), HOH (hoh) |
|
2877
|
|
|
|
|
|
|
|
|
2878
|
|
|
|
|
|
|
read_table($filename, 'output.type' => 'aoh'); |
|
2879
|
|
|
|
|
|
|
|
|
2880
|
|
|
|
|
|
|
read_table($filename, 'output.type' => 'hoa'); |
|
2881
|
|
|
|
|
|
|
|
|
2882
|
|
|
|
|
|
|
and, like Text::CSV_XS, filters can be applied in order to save RAM on big files: |
|
2883
|
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
$test_data = read_table( |
|
2885
|
|
|
|
|
|
|
't/HepatitisCdata.csv', |
|
2886
|
|
|
|
|
|
|
filter => { |
|
2887
|
|
|
|
|
|
|
Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column |
|
2888
|
|
|
|
|
|
|
}, |
|
2889
|
|
|
|
|
|
|
'output.type' => 'aoh' |
|
2890
|
|
|
|
|
|
|
); |
|
2891
|
|
|
|
|
|
|
|
|
2892
|
|
|
|
|
|
|
the default delimiter is C<,> |
|
2893
|
|
|
|
|
|
|
Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority. |
|
2894
|
|
|
|
|
|
|
|
|
2895
|
|
|
|
|
|
|
=head2 rnorm |
|
2896
|
|
|
|
|
|
|
|
|
2897
|
|
|
|
|
|
|
Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C. |
|
2898
|
|
|
|
|
|
|
|
|
2899
|
|
|
|
|
|
|
my ($rmean, $sd, $n) = (10, 2, 9999); |
|
2900
|
|
|
|
|
|
|
my $normals = rnorm( n => $n, mean => $rmean, sd => $sd); |
|
2901
|
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
=head2 runif |
|
2903
|
|
|
|
|
|
|
|
|
2904
|
|
|
|
|
|
|
Make an approximately uniform distribution into an array |
|
2905
|
|
|
|
|
|
|
|
|
2906
|
|
|
|
|
|
|
=head3 named arguments |
|
2907
|
|
|
|
|
|
|
|
|
2908
|
|
|
|
|
|
|
my $unif = runif( n => $n, min => 0, max => 1); |
|
2909
|
|
|
|
|
|
|
|
|
2910
|
|
|
|
|
|
|
where C is the number of items, the values are between C and C |
|
2911
|
|
|
|
|
|
|
|
|
2912
|
|
|
|
|
|
|
=head3 positional args |
|
2913
|
|
|
|
|
|
|
|
|
2914
|
|
|
|
|
|
|
this is to match R's behavior: |
|
2915
|
|
|
|
|
|
|
|
|
2916
|
|
|
|
|
|
|
runif( 9 ) |
|
2917
|
|
|
|
|
|
|
|
|
2918
|
|
|
|
|
|
|
will make 9 numbers in [0,1] |
|
2919
|
|
|
|
|
|
|
|
|
2920
|
|
|
|
|
|
|
runif(9, 0, 99) |
|
2921
|
|
|
|
|
|
|
|
|
2922
|
|
|
|
|
|
|
will match C, C, and C respectively |
|
2923
|
|
|
|
|
|
|
|
|
2924
|
|
|
|
|
|
|
=head2 sample |
|
2925
|
|
|
|
|
|
|
|
|
2926
|
|
|
|
|
|
|
take a sample of hash or array slices. |
|
2927
|
|
|
|
|
|
|
|
|
2928
|
|
|
|
|
|
|
my $h = sample(\%h, 4); # take 4 hash keys and their values into $h |
|
2929
|
|
|
|
|
|
|
|
|
2930
|
|
|
|
|
|
|
or, alternatively, with arrays: |
|
2931
|
|
|
|
|
|
|
|
|
2932
|
|
|
|
|
|
|
my $arr = sample(\@arr, 3); # take 3 indices of an array |
|
2933
|
|
|
|
|
|
|
|
|
2934
|
|
|
|
|
|
|
=head2 scale |
|
2935
|
|
|
|
|
|
|
|
|
2936
|
|
|
|
|
|
|
my @scaled_results = scale(1..5); |
|
2937
|
|
|
|
|
|
|
|
|
2938
|
|
|
|
|
|
|
You can also pass an options hash to disable centering or scaling: |
|
2939
|
|
|
|
|
|
|
|
|
2940
|
|
|
|
|
|
|
my @scaled_results = scale(1..5, { center => false, scale => 1 }); |
|
2941
|
|
|
|
|
|
|
|
|
2942
|
|
|
|
|
|
|
It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently: |
|
2943
|
|
|
|
|
|
|
|
|
2944
|
|
|
|
|
|
|
my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]); |
|
2945
|
|
|
|
|
|
|
|
|
2946
|
|
|
|
|
|
|
=head2 sd |
|
2947
|
|
|
|
|
|
|
|
|
2948
|
|
|
|
|
|
|
my $stdev = sd(2,4,4,4,5,5,7,9); |
|
2949
|
|
|
|
|
|
|
|
|
2950
|
|
|
|
|
|
|
Correct answer is 2.1380899352994 |
|
2951
|
|
|
|
|
|
|
|
|
2952
|
|
|
|
|
|
|
C can accept both array references as well as arrays: |
|
2953
|
|
|
|
|
|
|
|
|
2954
|
|
|
|
|
|
|
my $stdev = sd([2,4,4,4,5,5,7,9]); |
|
2955
|
|
|
|
|
|
|
|
|
2956
|
|
|
|
|
|
|
As of version 0.02, sd will croak/die if any undefined values are provided. |
|
2957
|
|
|
|
|
|
|
|
|
2958
|
|
|
|
|
|
|
=head2 seq |
|
2959
|
|
|
|
|
|
|
|
|
2960
|
|
|
|
|
|
|
Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference. |
|
2961
|
|
|
|
|
|
|
|
|
2962
|
|
|
|
|
|
|
=head3 Standard integer sequence |
|
2963
|
|
|
|
|
|
|
|
|
2964
|
|
|
|
|
|
|
say 'seq(1, 5):'; |
|
2965
|
|
|
|
|
|
|
my @seq = seq(1, 5); |
|
2966
|
|
|
|
|
|
|
say join(', ', @seq), "\n"; |
|
2967
|
|
|
|
|
|
|
|
|
2968
|
|
|
|
|
|
|
say 'seq(1, 2, 0.25):'; |
|
2969
|
|
|
|
|
|
|
@seq = seq(1, 2, 0.25); |
|
2970
|
|
|
|
|
|
|
|
|
2971
|
|
|
|
|
|
|
=head3 Fractional steps |
|
2972
|
|
|
|
|
|
|
|
|
2973
|
|
|
|
|
|
|
say 'seq(1, 2, 0.25):'; |
|
2974
|
|
|
|
|
|
|
@seq = seq(1, 2, 0.25); |
|
2975
|
|
|
|
|
|
|
say join(", ", @seq), "\n"; |
|
2976
|
|
|
|
|
|
|
for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop |
|
2977
|
|
|
|
|
|
|
is_approx(pop @seq, $idx, "seq item $idx with fractional step"); |
|
2978
|
|
|
|
|
|
|
} |
|
2979
|
|
|
|
|
|
|
|
|
2980
|
|
|
|
|
|
|
=head3 Negative steps |
|
2981
|
|
|
|
|
|
|
|
|
2982
|
|
|
|
|
|
|
say 'seq(10, 5, -1):'; |
|
2983
|
|
|
|
|
|
|
@seq = seq(10, 5, -1); |
|
2984
|
|
|
|
|
|
|
say join(", ", @seq), "\n"; |
|
2985
|
|
|
|
|
|
|
for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop |
|
2986
|
|
|
|
|
|
|
is_approx(pop @seq, $idx, "seq item $idx with negative step"); |
|
2987
|
|
|
|
|
|
|
} |
|
2988
|
|
|
|
|
|
|
|
|
2989
|
|
|
|
|
|
|
=head2 shapiro_test |
|
2990
|
|
|
|
|
|
|
|
|
2991
|
|
|
|
|
|
|
tests to see if an array reference is normally distributed, returns a p-value and a statistic |
|
2992
|
|
|
|
|
|
|
|
|
2993
|
|
|
|
|
|
|
my $shapiro = shapiro_test( |
|
2994
|
|
|
|
|
|
|
[1..5] |
|
2995
|
|
|
|
|
|
|
); |
|
2996
|
|
|
|
|
|
|
|
|
2997
|
|
|
|
|
|
|
and returns the hash reference: |
|
2998
|
|
|
|
|
|
|
|
|
2999
|
|
|
|
|
|
|
{ |
|
3000
|
|
|
|
|
|
|
p.value 0.589650577093106, |
|
3001
|
|
|
|
|
|
|
p_value 0.589650577093106, |
|
3002
|
|
|
|
|
|
|
statistic 0.960870680168535, |
|
3003
|
|
|
|
|
|
|
W 0.960870680168535 |
|
3004
|
|
|
|
|
|
|
} |
|
3005
|
|
|
|
|
|
|
|
|
3006
|
|
|
|
|
|
|
=head2 sum |
|
3007
|
|
|
|
|
|
|
|
|
3008
|
|
|
|
|
|
|
returns sum, but using both arrays and array references. |
|
3009
|
|
|
|
|
|
|
|
|
3010
|
|
|
|
|
|
|
my $test_data = [1..8]; |
|
3011
|
|
|
|
|
|
|
sum($test_data) |
|
3012
|
|
|
|
|
|
|
|
|
3013
|
|
|
|
|
|
|
which I prefer, compared to List::Util's required casting into an array: |
|
3014
|
|
|
|
|
|
|
|
|
3015
|
|
|
|
|
|
|
sum(@{ $test_data }); |
|
3016
|
|
|
|
|
|
|
|
|
3017
|
|
|
|
|
|
|
which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B |
|
3018
|
|
|
|
|
|
|
|
|
3019
|
|
|
|
|
|
|
as of version 0.02, C will cause the script to die if any undefined values are provided |
|
3020
|
|
|
|
|
|
|
|
|
3021
|
|
|
|
|
|
|
=head2 summary |
|
3022
|
|
|
|
|
|
|
|
|
3023
|
|
|
|
|
|
|
Analogous to R's C, but does not deal with outputs from other functions. |
|
3024
|
|
|
|
|
|
|
C only describes data as it is entered. |
|
3025
|
|
|
|
|
|
|
An option C or its synonym C specifies the maximum number of rows that will print. |
|
3026
|
|
|
|
|
|
|
|
|
3027
|
|
|
|
|
|
|
=head3 array of array input |
|
3028
|
|
|
|
|
|
|
|
|
3029
|
|
|
|
|
|
|
my @arr; |
|
3030
|
|
|
|
|
|
|
foreach my $i (0..18) { |
|
3031
|
|
|
|
|
|
|
push @arr, runif(22); |
|
3032
|
|
|
|
|
|
|
} |
|
3033
|
|
|
|
|
|
|
|
|
3034
|
|
|
|
|
|
|
and then C, or C |
|
3035
|
|
|
|
|
|
|
|
|
3036
|
|
|
|
|
|
|
--------------------------------------------------------------------------- |
|
3037
|
|
|
|
|
|
|
Index # values Min. 1st Qu. Median Mean 3rd Qu. Max. |
|
3038
|
|
|
|
|
|
|
--------------------------------------------------------------------------- |
|
3039
|
|
|
|
|
|
|
0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633 |
|
3040
|
|
|
|
|
|
|
1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371 |
|
3041
|
|
|
|
|
|
|
2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878 |
|
3042
|
|
|
|
|
|
|
3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808 |
|
3043
|
|
|
|
|
|
|
4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043 |
|
3044
|
|
|
|
|
|
|
5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556 |
|
3045
|
|
|
|
|
|
|
|
|
3046
|
|
|
|
|
|
|
=head3 hash of array input |
|
3047
|
|
|
|
|
|
|
|
|
3048
|
|
|
|
|
|
|
$test_data = summary( |
|
3049
|
|
|
|
|
|
|
{ |
|
3050
|
|
|
|
|
|
|
A => runif(9), |
|
3051
|
|
|
|
|
|
|
B => runif(9) |
|
3052
|
|
|
|
|
|
|
}, |
|
3053
|
|
|
|
|
|
|
); |
|
3054
|
|
|
|
|
|
|
|
|
3055
|
|
|
|
|
|
|
=head2 t_test |
|
3056
|
|
|
|
|
|
|
|
|
3057
|
|
|
|
|
|
|
There are 1-sample and 2-sample t-tests, from one or two arrays: |
|
3058
|
|
|
|
|
|
|
|
|
3059
|
|
|
|
|
|
|
my $t_test = t_test( $array1, mu => 0.2334 ); |
|
3060
|
|
|
|
|
|
|
|
|
3061
|
|
|
|
|
|
|
or 2-sample: |
|
3062
|
|
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
$t_test = t_test( |
|
3064
|
|
|
|
|
|
|
$array1, $array2, |
|
3065
|
|
|
|
|
|
|
paired => 1 |
|
3066
|
|
|
|
|
|
|
); |
|
3067
|
|
|
|
|
|
|
|
|
3068
|
|
|
|
|
|
|
returns a hash reference, which looks like: |
|
3069
|
|
|
|
|
|
|
|
|
3070
|
|
|
|
|
|
|
conf_int => [ |
|
3071
|
|
|
|
|
|
|
-0.06672889, 0.25672889 |
|
3072
|
|
|
|
|
|
|
], |
|
3073
|
|
|
|
|
|
|
df => 5, |
|
3074
|
|
|
|
|
|
|
estimate => 0.095, |
|
3075
|
|
|
|
|
|
|
p_value => 0.19143688433660, |
|
3076
|
|
|
|
|
|
|
statistic => 1.50996688705414 |
|
3077
|
|
|
|
|
|
|
|
|
3078
|
|
|
|
|
|
|
the two groups compared can be specified, though not necessarily, as C and C, just like in R: |
|
3079
|
|
|
|
|
|
|
|
|
3080
|
|
|
|
|
|
|
$t_test = t_test( |
|
3081
|
|
|
|
|
|
|
'x' => $array1, 'y' => $array2, |
|
3082
|
|
|
|
|
|
|
paired => 1 |
|
3083
|
|
|
|
|
|
|
); |
|
3084
|
|
|
|
|
|
|
|
|
3085
|
|
|
|
|
|
|
=head3 Parameters |
|
3086
|
|
|
|
|
|
|
|
|
3087
|
|
|
|
|
|
|
|
|
3088
|
|
|
|
|
|
|
|
|
3089
|
|
|
|
|
|
|
=begin html |
|
3090
|
|
|
|
|
|
|
|
|
3091
|
|
|
|
|
|
|
|
|
3145
|
|
|
|
|
|
|
|
|
3146
|
|
|
|
|
|
|
=end html |
|
3147
|
|
|
|
|
|
|
|
|
3148
|
|
|
|
|
|
|
|
|
3149
|
|
|
|
|
|
|
|
|
3150
|
|
|
|
|
|
|
=head3 Return Hash |
|
3151
|
|
|
|
|
|
|
|
|
3152
|
|
|
|
|
|
|
|
|
3153
|
|
|
|
|
|
|
|
|
3154
|
|
|
|
|
|
|
=begin html |
|
3155
|
|
|
|
|
|
|
|
|
3156
|
|
|
|
|
|
|
|
|
3194
|
|
|
|
|
|
|
|
|
3195
|
|
|
|
|
|
|
=end html |
|
3196
|
|
|
|
|
|
|
|
|
3197
|
|
|
|
|
|
|
|
|
3198
|
|
|
|
|
|
|
|
|
3199
|
|
|
|
|
|
|
=head2 transpose |
|
3200
|
|
|
|
|
|
|
|
|
3201
|
|
|
|
|
|
|
Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes. |
|
3202
|
|
|
|
|
|
|
Returns a new reference of the same type; the input is never modified. |
|
3203
|
|
|
|
|
|
|
|
|
3204
|
|
|
|
|
|
|
=head3 Array of array input |
|
3205
|
|
|
|
|
|
|
|
|
3206
|
|
|
|
|
|
|
Takes a reference to an array of array references and returns a new AoA where C |
|
3207
|
|
|
|
|
|
|
|
|
3208
|
|
|
|
|
|
|
my $matrix = [[1, 2, 3], [4, 5, 6]]; |
|
3209
|
|
|
|
|
|
|
my $t = transpose($matrix); |
|
3210
|
|
|
|
|
|
|
# [[1, 4], |
|
3211
|
|
|
|
|
|
|
# [2, 5], |
|
3212
|
|
|
|
|
|
|
# [3, 6]] |
|
3213
|
|
|
|
|
|
|
|
|
3214
|
|
|
|
|
|
|
All rows must be the same length; a ragged input is a fatal error. |
|
3215
|
|
|
|
|
|
|
C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>. |
|
3216
|
|
|
|
|
|
|
|
|
3217
|
|
|
|
|
|
|
Dies if: |
|
3218
|
|
|
|
|
|
|
- any inner element is not an array reference |
|
3219
|
|
|
|
|
|
|
- rows differ in length (ragged array) |
|
3220
|
|
|
|
|
|
|
|
|
3221
|
|
|
|
|
|
|
=head3 Hash of hash input |
|
3222
|
|
|
|
|
|
|
|
|
3223
|
|
|
|
|
|
|
Takes a reference to a hash of hash references and returns a new HoH where C |
|
3224
|
|
|
|
|
|
|
|
|
3225
|
|
|
|
|
|
|
my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } }; |
|
3226
|
|
|
|
|
|
|
my $t = transpose($table); |
|
3227
|
|
|
|
|
|
|
# { score => { alice => 97, bob => 84 }, |
|
3228
|
|
|
|
|
|
|
# grade => { alice => 'A', bob => 'B' } } |
|
3229
|
|
|
|
|
|
|
|
|
3230
|
|
|
|
|
|
|
Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed. |
|
3231
|
|
|
|
|
|
|
|
|
3232
|
|
|
|
|
|
|
my $sparse = { |
|
3233
|
|
|
|
|
|
|
a => { x => 1, y => 2 }, |
|
3234
|
|
|
|
|
|
|
b => { x => 3, z => 4 } }; |
|
3235
|
|
|
|
|
|
|
|
|
3236
|
|
|
|
|
|
|
my $t = transpose($sparse); |
|
3237
|
|
|
|
|
|
|
# { x => { a => 1, b => 3 }, |
|
3238
|
|
|
|
|
|
|
# y => { a => 2 }, |
|
3239
|
|
|
|
|
|
|
# z => { b => 4 } } |
|
3240
|
|
|
|
|
|
|
|
|
3241
|
|
|
|
|
|
|
An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>. |
|
3242
|
|
|
|
|
|
|
|
|
3243
|
|
|
|
|
|
|
Dies if any inner element is not a hash reference |
|
3244
|
|
|
|
|
|
|
|
|
3245
|
|
|
|
|
|
|
=head2 value_counts |
|
3246
|
|
|
|
|
|
|
|
|
3247
|
|
|
|
|
|
|
Count the values in a given data set, return a hash reference showing how many times each particular value is present. |
|
3248
|
|
|
|
|
|
|
|
|
3249
|
|
|
|
|
|
|
=head3 Scalar |
|
3250
|
|
|
|
|
|
|
|
|
3251
|
|
|
|
|
|
|
$hash = value_counts('c'); |
|
3252
|
|
|
|
|
|
|
|
|
3253
|
|
|
|
|
|
|
returns C<< { c =E 1 } >> |
|
3254
|
|
|
|
|
|
|
|
|
3255
|
|
|
|
|
|
|
=head3 Array reference |
|
3256
|
|
|
|
|
|
|
|
|
3257
|
|
|
|
|
|
|
value_counts(['a','b','b']); |
|
3258
|
|
|
|
|
|
|
|
|
3259
|
|
|
|
|
|
|
returns C<< { a =E 1, b =E 2} >> |
|
3260
|
|
|
|
|
|
|
|
|
3261
|
|
|
|
|
|
|
=head3 Array |
|
3262
|
|
|
|
|
|
|
|
|
3263
|
|
|
|
|
|
|
my $value_counts = value_counts('a','b','b'); |
|
3264
|
|
|
|
|
|
|
|
|
3265
|
|
|
|
|
|
|
like an array reference above, returns C<< { a =E 1, b =E 2} >> |
|
3266
|
|
|
|
|
|
|
|
|
3267
|
|
|
|
|
|
|
=head3 Hash |
|
3268
|
|
|
|
|
|
|
|
|
3269
|
|
|
|
|
|
|
my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } ); |
|
3270
|
|
|
|
|
|
|
|
|
3271
|
|
|
|
|
|
|
returns C<< { a =E 2, b =E 1} >> |
|
3272
|
|
|
|
|
|
|
|
|
3273
|
|
|
|
|
|
|
=head3 Hash of array |
|
3274
|
|
|
|
|
|
|
|
|
3275
|
|
|
|
|
|
|
my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}); |
|
3276
|
|
|
|
|
|
|
|
|
3277
|
|
|
|
|
|
|
without a key (like above), the occurences of C, C, and C are counted. |
|
3278
|
|
|
|
|
|
|
|
|
3279
|
|
|
|
|
|
|
With a key, like C for above, only values within that hash key are counted: |
|
3280
|
|
|
|
|
|
|
|
|
3281
|
|
|
|
|
|
|
my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a'); |
|
3282
|
|
|
|
|
|
|
|
|
3283
|
|
|
|
|
|
|
=head3 Hash of hash (table) |
|
3284
|
|
|
|
|
|
|
|
|
3285
|
|
|
|
|
|
|
$hash = value_counts( { |
|
3286
|
|
|
|
|
|
|
A => { |
|
3287
|
|
|
|
|
|
|
a => 'x', |
|
3288
|
|
|
|
|
|
|
b => 'z' |
|
3289
|
|
|
|
|
|
|
}, |
|
3290
|
|
|
|
|
|
|
B => { |
|
3291
|
|
|
|
|
|
|
a => 'x' |
|
3292
|
|
|
|
|
|
|
}, |
|
3293
|
|
|
|
|
|
|
C => { |
|
3294
|
|
|
|
|
|
|
a => 'y' |
|
3295
|
|
|
|
|
|
|
} |
|
3296
|
|
|
|
|
|
|
}, 'a'); |
|
3297
|
|
|
|
|
|
|
|
|
3298
|
|
|
|
|
|
|
the column, or second hash key, that you wish to count, is specified at the command line |
|
3299
|
|
|
|
|
|
|
|
|
3300
|
|
|
|
|
|
|
=head2 var |
|
3301
|
|
|
|
|
|
|
|
|
3302
|
|
|
|
|
|
|
as simple as possible: |
|
3303
|
|
|
|
|
|
|
|
|
3304
|
|
|
|
|
|
|
var(2, 4, 5, 8, 9) |
|
3305
|
|
|
|
|
|
|
|
|
3306
|
|
|
|
|
|
|
as of version 0.02, C will die if any undefined values are provided |
|
3307
|
|
|
|
|
|
|
|
|
3308
|
|
|
|
|
|
|
like C, C, etc., C can accept array references, to make code simpler: |
|
3309
|
|
|
|
|
|
|
|
|
3310
|
|
|
|
|
|
|
my $ref = \@arr; |
|
3311
|
|
|
|
|
|
|
var($ref) = var(@arr) |
|
3312
|
|
|
|
|
|
|
|
|
3313
|
|
|
|
|
|
|
=head2 var_test |
|
3314
|
|
|
|
|
|
|
|
|
3315
|
|
|
|
|
|
|
As described by R: Performs an F test to compare the variances of two samples from normal populations |
|
3316
|
|
|
|
|
|
|
|
|
3317
|
|
|
|
|
|
|
use Stats::LikeR; |
|
3318
|
|
|
|
|
|
|
|
|
3319
|
|
|
|
|
|
|
my @x = (2.9, 3.0, 2.5, 2.6, 3.2); |
|
3320
|
|
|
|
|
|
|
my @y = (3.8, 2.7, 4.0, 2.4); |
|
3321
|
|
|
|
|
|
|
|
|
3322
|
|
|
|
|
|
|
my $vt = var_test(\@x, \@y); |
|
3323
|
|
|
|
|
|
|
|
|
3324
|
|
|
|
|
|
|
also, conf_level can be set: |
|
3325
|
|
|
|
|
|
|
|
|
3326
|
|
|
|
|
|
|
$vt = var_test(\@x, \@y, conf_level => 0.99); |
|
3327
|
|
|
|
|
|
|
|
|
3328
|
|
|
|
|
|
|
as well as a ratio (from R: the hypothesized ratio of the population variances of C and C: |
|
3329
|
|
|
|
|
|
|
|
|
3330
|
|
|
|
|
|
|
$test_data = var_test(\@xk, \@yk, ratio => 2); |
|
3331
|
|
|
|
|
|
|
|
|
3332
|
|
|
|
|
|
|
=head2 view |
|
3333
|
|
|
|
|
|
|
|
|
3334
|
|
|
|
|
|
|
An R-style C for the structures C returns. Prints the first |
|
3335
|
|
|
|
|
|
|
few rows of a dataframe as an aligned text table, with numeric columns |
|
3336
|
|
|
|
|
|
|
right-justified, string columns left-justified, and undefined cells shown as |
|
3337
|
|
|
|
|
|
|
C. Works on all three C values: |
|
3338
|
|
|
|
|
|
|
|
|
3339
|
|
|
|
|
|
|
|
|
3340
|
|
|
|
|
|
|
|
|
3341
|
|
|
|
|
|
|
=begin html |
|
3342
|
|
|
|
|
|
|
|
|
3343
|
|
|
|
|
|
|
|
|
3369
|
|
|
|
|
|
|
|
|
3370
|
|
|
|
|
|
|
=end html |
|
3371
|
|
|
|
|
|
|
|
|
3372
|
|
|
|
|
|
|
|
|
3373
|
|
|
|
|
|
|
|
|
3374
|
|
|
|
|
|
|
=head3 Synopsis |
|
3375
|
|
|
|
|
|
|
|
|
3376
|
|
|
|
|
|
|
my $aoh = read_table('all.data.tsv', 'output.type' => 'aoh'); |
|
3377
|
|
|
|
|
|
|
|
|
3378
|
|
|
|
|
|
|
view($aoh); # first 6 rows, like head() |
|
3379
|
|
|
|
|
|
|
view($aoh, n => 20); # first 20 rows |
|
3380
|
|
|
|
|
|
|
view($aoh, cols => [qw(id age tt)]); # force a column order |
|
3381
|
|
|
|
|
|
|
view($aoh, 'row.names' => 'id'); # use column 'id' as the row label |
|
3382
|
|
|
|
|
|
|
view($aoh, na => '.', max_width => 30); |
|
3383
|
|
|
|
|
|
|
|
|
3384
|
|
|
|
|
|
|
my $txt = view($aoh, return_only => 1); # capture the string, print nothing |
|
3385
|
|
|
|
|
|
|
view($aoh, to => \*STDERR); # print somewhere other than STDOUT |
|
3386
|
|
|
|
|
|
|
|
|
3387
|
|
|
|
|
|
|
=head3 Output |
|
3388
|
|
|
|
|
|
|
|
|
3389
|
|
|
|
|
|
|
# AoH: 7 rows x 3 cols (showing 6) |
|
3390
|
|
|
|
|
|
|
row_name Testosterone, total (nmol/L) age sex |
|
3391
|
|
|
|
|
|
|
p1 18.2 41 M |
|
3392
|
|
|
|
|
|
|
p2 NA 7 F |
|
3393
|
|
|
|
|
|
|
p3 1.05 33 F |
|
3394
|
|
|
|
|
|
|
p4 22.9 55 M |
|
3395
|
|
|
|
|
|
|
p5 14 29 M |
|
3396
|
|
|
|
|
|
|
p6 NA 62 F |
|
3397
|
|
|
|
|
|
|
# ... 1 more row |
|
3398
|
|
|
|
|
|
|
|
|
3399
|
|
|
|
|
|
|
The banner reports the structure type, full dimensions, and how many rows are |
|
3400
|
|
|
|
|
|
|
displayed. A footer appears only when rows are hidden. |
|
3401
|
|
|
|
|
|
|
|
|
3402
|
|
|
|
|
|
|
=head3 Arguments |
|
3403
|
|
|
|
|
|
|
|
|
3404
|
|
|
|
|
|
|
All arguments after the data reference are optional name/value pairs. |
|
3405
|
|
|
|
|
|
|
|
|
3406
|
|
|
|
|
|
|
|
|
3407
|
|
|
|
|
|
|
|
|
3408
|
|
|
|
|
|
|
=begin html |
|
3409
|
|
|
|
|
|
|
|
|
3410
|
|
|
|
|
|
|
|
|
3471
|
|
|
|
|
|
|
|
|
3472
|
|
|
|
|
|
|
=end html |
|
3473
|
|
|
|
|
|
|
|
|
3474
|
|
|
|
|
|
|
|
|
3475
|
|
|
|
|
|
|
|
|
3476
|
|
|
|
|
|
|
C always returns the formatted string, whether or not it also prints. |
|
3477
|
|
|
|
|
|
|
|
|
3478
|
|
|
|
|
|
|
=head3 A note on column order |
|
3479
|
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
C stores rows as hashes, so the original CSV column order is not |
|
3481
|
|
|
|
|
|
|
preserved. C therefore sorts columns by name for a stable, reproducible |
|
3482
|
|
|
|
|
|
|
layout. Two conveniences soften this: |
|
3483
|
|
|
|
|
|
|
|
|
3484
|
|
|
|
|
|
|
=over |
|
3485
|
|
|
|
|
|
|
|
|
3486
|
|
|
|
|
|
|
=item * A column literally named C (the label C assigns to a |
|
3487
|
|
|
|
|
|
|
leading blank header) is detected automatically and moved to the left as the |
|
3488
|
|
|
|
|
|
|
row label. |
|
3489
|
|
|
|
|
|
|
|
|
3490
|
|
|
|
|
|
|
=item * Pass C<< cols =E [ ... ] >> to control both the order and the selection of columns |
|
3491
|
|
|
|
|
|
|
shown. |
|
3492
|
|
|
|
|
|
|
|
|
3493
|
|
|
|
|
|
|
=back |
|
3494
|
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
When no label column is present, C numbers the rows C<1, 2, 3, …>, the way |
|
3496
|
|
|
|
|
|
|
R prints row names for an unnamed data frame. |
|
3497
|
|
|
|
|
|
|
|
|
3498
|
|
|
|
|
|
|
=head3 Edge cases |
|
3499
|
|
|
|
|
|
|
|
|
3500
|
|
|
|
|
|
|
=over |
|
3501
|
|
|
|
|
|
|
|
|
3502
|
|
|
|
|
|
|
=item * Empty input (C<[]> or C<{}>) prints a clean C<0 rows x 0 cols> banner. |
|
3503
|
|
|
|
|
|
|
|
|
3504
|
|
|
|
|
|
|
=item * Tabs, carriage returns, and newlines inside a cell are escaped (C<\t>, C<\r>, |
|
3505
|
|
|
|
|
|
|
C<\n>) so one record always stays on one line. |
|
3506
|
|
|
|
|
|
|
|
|
3507
|
|
|
|
|
|
|
=item * A non-reference argument, or a hash whose values are plain scalars, dies with |
|
3508
|
|
|
|
|
|
|
a clear message rather than producing garbled output. |
|
3509
|
|
|
|
|
|
|
|
|
3510
|
|
|
|
|
|
|
=back |
|
3511
|
|
|
|
|
|
|
|
|
3512
|
|
|
|
|
|
|
=head3 Tests |
|
3513
|
|
|
|
|
|
|
|
|
3514
|
|
|
|
|
|
|
The behavior above is covered by C (run with C): the three |
|
3515
|
|
|
|
|
|
|
structure types, C boundaries, alignment, C rendering, truncation, |
|
3516
|
|
|
|
|
|
|
C/C handling, control-character escaping, the C and |
|
3517
|
|
|
|
|
|
|
C output paths, empty structures, and the error cases. |
|
3518
|
|
|
|
|
|
|
|
|
3519
|
|
|
|
|
|
|
=head2 wilcox_test |
|
3520
|
|
|
|
|
|
|
|
|
3521
|
|
|
|
|
|
|
$test_data = wilcox_test( |
|
3522
|
|
|
|
|
|
|
[1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30], |
|
3523
|
|
|
|
|
|
|
[0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29] |
|
3524
|
|
|
|
|
|
|
); |
|
3525
|
|
|
|
|
|
|
|
|
3526
|
|
|
|
|
|
|
Computes the Wilcoxon rank-sum / Mann-Whitney test (two samples) or the Wilcoxon signed-rank test (one sample or paired), following R's C conventions. |
|
3527
|
|
|
|
|
|
|
This is an alternative to the t-test, that does not assume a normal distribution. |
|
3528
|
|
|
|
|
|
|
With two array refs and no C flag it runs the two-sample rank-sum test; with a single sample, or with C<< paired =E 1 >>, it runs the signed-rank test. It calculates exact p-values by default for C<< N E 50 >> without ties; when ties (or, for the signed-rank case, zero differences) are present it automatically switches to the normal approximation with continuity correction. |
|
3529
|
|
|
|
|
|
|
|
|
3530
|
|
|
|
|
|
|
=head3 Calling conventions |
|
3531
|
|
|
|
|
|
|
|
|
3532
|
|
|
|
|
|
|
The first one or two array-ref arguments are taken positionally as C and C; everything after that is parsed as C<< key =E value >> pairs. The named forms C<< x =E >> and C<< y =E >> are also accepted and override the positional values. The flat argument list following the positional refs must contain an even number of elements, or the call dies with a usage message. |
|
3533
|
|
|
|
|
|
|
|
|
3534
|
|
|
|
|
|
|
# positional |
|
3535
|
|
|
|
|
|
|
wilcox_test(\@x, \@y, paired => 1); |
|
3536
|
|
|
|
|
|
|
|
|
3537
|
|
|
|
|
|
|
# fully named |
|
3538
|
|
|
|
|
|
|
wilcox_test(x => \@x, y => \@y, alternative => "greater", exact => 0); |
|
3539
|
|
|
|
|
|
|
|
|
3540
|
|
|
|
|
|
|
=head3 Input parameters |
|
3541
|
|
|
|
|
|
|
|
|
3542
|
|
|
|
|
|
|
|
|
3543
|
|
|
|
|
|
|
|
|
3544
|
|
|
|
|
|
|
=begin html |
|
3545
|
|
|
|
|
|
|
|
|
3546
|
|
|
|
|
|
|
|
3547
|
|
|
|
|
|
|
|
3548
|
|
|
|
|
|
|
|
|
3549
|
|
|
|
|
|
|
| Parameter |
|
3550
|
|
|
|
|
|
|
| Type |
|
3551
|
|
|
|
|
|
|
| Default |
|
3552
|
|
|
|
|
|
|
| Description |
|
3553
|
|
|
|
|
|
|
|
|
3554
|
|
|
|
|
|
|
| |
|
3555
|
|
|
|
|
|
|
|
|
3556
|
|
|
|
|
|
|
|
|
3557
|
|
|
|
|
|
|
| x |
|
3558
|
|
|
|
|
|
|
| ARRAY ref |
|
3559
|
|
|
|
|
|
|
| (required) |
|
3560
|
|
|
|
|
|
|
| The first sample. Passed positionally or as x =>. Non-numeric and undefined elements are silently dropped; an empty or all-missing x is fatal. In the two-sample test mu is subtracted from each x value. |
|
3561
|
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
|
|
3563
|
|
|
|
|
|
|
| y |
|
3564
|
|
|
|
|
|
|
| ARRAY ref |
|
3565
|
|
|
|
|
|
|
| undef |
|
3566
|
|
|
|
|
|
|
| The second sample. If present and paired is false, a two-sample rank-sum test is run. If paired is true, y is required and must be the same length as x. Omit it for the one-sample signed-rank test. |
|
3567
|
|
|
|
|
|
|
|
|
3568
|
|
|
|
|
|
|
|
|
3569
|
|
|
|
|
|
|
| paired |
|
3570
|
|
|
|
|
|
|
| boolean |
|
3571
|
|
|
|
|
|
|
| 0 (false) |
|
3572
|
|
|
|
|
|
|
| Run a paired signed-rank test on the per-element differences x[i] - y[i] - mu. Requires y of equal length. |
|
3573
|
|
|
|
|
|
|
|
|
3574
|
|
|
|
|
|
|
|
|
3575
|
|
|
|
|
|
|
| correct |
|
3576
|
|
|
|
|
|
|
| boolean |
|
3577
|
|
|
|
|
|
|
| 1 (true) |
|
3578
|
|
|
|
|
|
|
| Apply the continuity correction (±0.5) when using the normal approximation. Ignored when an exact p-value is computed. |
|
3579
|
|
|
|
|
|
|
|
|
3580
|
|
|
|
|
|
|
|
|
3581
|
|
|
|
|
|
|
| mu |
|
3582
|
|
|
|
|
|
|
| number |
|
3583
|
|
|
|
|
|
|
| 0.0 |
|
3584
|
|
|
|
|
|
|
| Null-hypothesis location shift. Subtracted from x (two-sample) or from each difference (one-sample / paired). |
|
3585
|
|
|
|
|
|
|
|
|
3586
|
|
|
|
|
|
|
|
|
3587
|
|
|
|
|
|
|
| exact |
|
3588
|
|
|
|
|
|
|
| boolean / undef |
|
3589
|
|
|
|
|
|
|
| undef (auto) |
|
3590
|
|
|
|
|
|
|
| Tri-state. undef (or absent) selects exact automatically: when both group sizes are < 50 and there are no ties (two-sample), or n < 50 with no ties (signed-rank). A true value forces the exact test, a false value forces the approximation. Exact is impossible with ties — or, for the signed-rank test, with zero differences — and falls back to the approximation with a warning. |
|
3591
|
|
|
|
|
|
|
|
|
3592
|
|
|
|
|
|
|
|
|
3593
|
|
|
|
|
|
|
| alternative |
|
3594
|
|
|
|
|
|
|
| string |
|
3595
|
|
|
|
|
|
|
| "two.sided" |
|
3596
|
|
|
|
|
|
|
| One of "two.sided", "less", or "greater". Selects the tail(s) used for the p-value. |
|
3597
|
|
|
|
|
|
|
|
|
3598
|
|
|
|
|
|
|
|
|
3599
|
|
|
|
|
|
|
| |
|
3600
|
|
|
|
|
|
|
|
|
3601
|
|
|
|
|
|
|
=end html |
|
3602
|
|
|
|
|
|
|
|
|
3603
|
|
|
|
|
|
|
|
|
3604
|
|
|
|
|
|
|
|
|
3605
|
|
|
|
|
|
|
=head3 Output |
|
3606
|
|
|
|
|
|
|
|
|
3607
|
|
|
|
|
|
|
Returns a hash ref with the following keys: |
|
3608
|
|
|
|
|
|
|
|
|
3609
|
|
|
|
|
|
|
|
|
3610
|
|
|
|
|
|
|
|
|
3611
|
|
|
|
|
|
|
=begin html |
|
3612
|
|
|
|
|
|
|
|
|
3613
|
|
|
|
|
|
|
|
3614
|
|
|
|
|
|
|
|
3615
|
|
|
|
|
|
|
|
|
3616
|
|
|
|
|
|
|
| Key |
|
3617
|
|
|
|
|
|
|
| Type |
|
3618
|
|
|
|
|
|
|
| Description |
|
3619
|
|
|
|
|
|
|
|
|
3620
|
|
|
|
|
|
|
| |
|
3621
|
|
|
|
|
|
|
|
|
3622
|
|
|
|
|
|
|
|
|
3623
|
|
|
|
|
|
|
| statistic |
|
3624
|
|
|
|
|
|
|
| number |
|
3625
|
|
|
|
|
|
|
| The test statistic. For the two-sample test this is the Mann-Whitney W (the x rank sum minus nx*(nx+1)/2). For the signed-rank test it is V, the sum of the ranks assigned to the positive differences. |
|
3626
|
|
|
|
|
|
|
|
|
3627
|
|
|
|
|
|
|
|
|
3628
|
|
|
|
|
|
|
| p_value |
|
3629
|
|
|
|
|
|
|
| number |
|
3630
|
|
|
|
|
|
|
| The p-value for the chosen alternative, capped at 1.0. Two-sided p-values are 2 * min(p_less, p_greater). |
|
3631
|
|
|
|
|
|
|
|
|
3632
|
|
|
|
|
|
|
|
|
3633
|
|
|
|
|
|
|
| method |
|
3634
|
|
|
|
|
|
|
| string |
|
3635
|
|
|
|
|
|
|
| A human-readable description of the exact test variant that was run (see below). |
|
3636
|
|
|
|
|
|
|
|
|
3637
|
|
|
|
|
|
|
|
|
3638
|
|
|
|
|
|
|
| alternative |
|
3639
|
|
|
|
|
|
|
| string |
|
3640
|
|
|
|
|
|
|
| Echoes the alternative actually used ("two.sided", "less", or "greater"). |
|
3641
|
|
|
|
|
|
|
|
|
3642
|
|
|
|
|
|
|
|
|
3643
|
|
|
|
|
|
|
| |
|
3644
|
|
|
|
|
|
|
|
|
3645
|
|
|
|
|
|
|
=end html |
|
3646
|
|
|
|
|
|
|
|
|
3647
|
|
|
|
|
|
|
|
|
3648
|
|
|
|
|
|
|
|
|
3649
|
|
|
|
|
|
|
The C string reports which path executed: |
|
3650
|
|
|
|
|
|
|
|
|
3651
|
|
|
|
|
|
|
=over |
|
3652
|
|
|
|
|
|
|
|
|
3653
|
|
|
|
|
|
|
=item * Two-sample: C<"Wilcoxon rank sum exact test">, C<"Wilcoxon rank sum test with continuity correction">, or C<"Wilcoxon rank sum test">. |
|
3654
|
|
|
|
|
|
|
|
|
3655
|
|
|
|
|
|
|
=item * One-sample / paired: C<"Wilcoxon exact signed rank test">, C<"Wilcoxon signed rank test with continuity correction">, or C<"Wilcoxon signed rank test">. |
|
3656
|
|
|
|
|
|
|
|
|
3657
|
|
|
|
|
|
|
=back |
|
3658
|
|
|
|
|
|
|
|
|
3659
|
|
|
|
|
|
|
=head3 Notes and edge cases |
|
3660
|
|
|
|
|
|
|
|
|
3661
|
|
|
|
|
|
|
Missing data is handled by listwise removal of non-numeric / undefined cells before ranking; in the paired case a pair is dropped if either member is missing. An empty C (or, in the two-sample case, an empty C) after this filtering is fatal. |
|
3662
|
|
|
|
|
|
|
|
|
3663
|
|
|
|
|
|
|
For the signed-rank test, exact zero differences are discarded before ranking (matching R), and their presence disables the exact computation. Both empty-after-filtering and all-zero-difference inputs are fatal. |
|
3664
|
|
|
|
|
|
|
|
|
3665
|
|
|
|
|
|
|
Ties are detected during ranking and trigger the tie-corrected variance in the normal approximation; they also rule out the exact p-value. When C is left on auto, the size thresholds (C<< E 50 >> per group, or C<< E 50 >> differences) are what gate the exact vs. approximate decision. |
|
3666
|
|
|
|
|
|
|
|
|
3667
|
|
|
|
|
|
|
=head2 write_table |
|
3668
|
|
|
|
|
|
|
|
|
3669
|
|
|
|
|
|
|
mimics R's C, with data as first argument to subroutine, and output file as second |
|
3670
|
|
|
|
|
|
|
|
|
3671
|
|
|
|
|
|
|
write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1); |
|
3672
|
|
|
|
|
|
|
|
|
3673
|
|
|
|
|
|
|
You can also precisely filter and reorder which columns are written by passing an array reference to C: |
|
3674
|
|
|
|
|
|
|
|
|
3675
|
|
|
|
|
|
|
write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']); |
|
3676
|
|
|
|
|
|
|
|
|
3677
|
|
|
|
|
|
|
undefined variables are printed as C by default, but can be set as you wish using C |
|
3678
|
|
|
|
|
|
|
|
|
3679
|
|
|
|
|
|
|
write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan') |
|
3680
|
|
|
|
|
|
|
|
|
3681
|
|
|
|
|
|
|
as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set. |
|
3682
|
|
|
|
|
|
|
|
|
3683
|
|
|
|
|
|
|
Args can also be accepted: |
|
3684
|
|
|
|
|
|
|
|
|
3685
|
|
|
|
|
|
|
write_table( 'data' => \%flat, 'file' => $f ); |
|
3686
|
|
|
|
|
|
|
|
|
3687
|
|
|
|
|
|
|
=head1 changes |
|
3688
|
|
|
|
|
|
|
|
|
3689
|
|
|
|
|
|
|
=head2 0.16 |
|
3690
|
|
|
|
|
|
|
|
|
3691
|
|
|
|
|
|
|
changes to dist.ini, the minimum Perl version disappeared when I fixed other problems |
|
3692
|
|
|
|
|
|
|
|
|
3693
|
|
|
|
|
|
|
clarifications between run time and test dependencies |
|
3694
|
|
|
|
|
|
|
|
|
3695
|
|
|
|
|
|
|
addition of C function to sort AoH and HoA |
|
3696
|
|
|
|
|
|
|
|
|
3697
|
|
|
|
|
|
|
addition of C to translate array of hashes into a hash of arrays |
|
3698
|
|
|
|
|
|
|
|
|
3699
|
|
|
|
|
|
|
fix of long double functions: https://www.cpantesters.org/cpan/report/5d5d9836-6a5f-11f1-aadb-63fd6d8775ea |
|
3700
|
|
|
|
|
|
|
|
|
3701
|
|
|
|
|
|
|
=head3 C |
|
3702
|
|
|
|
|
|
|
|
|
3703
|
|
|
|
|
|
|
output residual keys now use names, not integers |
|
3704
|
|
|
|
|
|
|
|
|
3705
|
|
|
|
|
|
|
=head3 C |
|
3706
|
|
|
|
|
|
|
|
|
3707
|
|
|
|
|
|
|
=head3 Bug fixes |
|
3708
|
|
|
|
|
|
|
|
|
3709
|
|
|
|
|
|
|
B When |
|
3710
|
|
|
|
|
|
|
C<< valid_n E= p >>, the cleanup freed the C I but not the |
|
3711
|
|
|
|
|
|
|
per-row name strings it held (those had been transferred out of C, |
|
3712
|
|
|
|
|
|
|
whose own array was already freed). The strings leaked on every such error. |
|
3713
|
|
|
|
|
|
|
Added the per-entry C loop before freeing the array, matching the |
|
3714
|
|
|
|
|
|
|
normal path. |
|
3715
|
|
|
|
|
|
|
|
|
3716
|
|
|
|
|
|
|
B Only the first hash value was |
|
3717
|
|
|
|
|
|
|
checked to be a C; subsequent values were C'd unconditionally, so |
|
3718
|
|
|
|
|
|
|
a malformed row (C<< { a =E {...}, b =E 5 } >>) dereferenced a non-reference. Every |
|
3719
|
|
|
|
|
|
|
row is now validated, with the partial allocations cleaned up before the |
|
3720
|
|
|
|
|
|
|
C, mirroring the existing AoH path. |
|
3721
|
|
|
|
|
|
|
|
|
3722
|
|
|
|
|
|
|
B<< C on a possibly-signed C. >> C is undefined for |
|
3723
|
|
|
|
|
|
|
byte values ≥ 0x80 on platforms where C is signed. Cast to |
|
3724
|
|
|
|
|
|
|
C<(unsigned char)> before the call. |
|
3725
|
|
|
|
|
|
|
|
|
3726
|
|
|
|
|
|
|
=head3 Speed / RAM improvements |
|
3727
|
|
|
|
|
|
|
|
|
3728
|
|
|
|
|
|
|
B C silently |
|
3729
|
|
|
|
|
|
|
truncated any longer formula. Replaced with a buffer sized to |
|
3730
|
|
|
|
|
|
|
C, so there is no fixed limit and no truncation. |
|
3731
|
|
|
|
|
|
|
|
|
3732
|
|
|
|
|
|
|
B<< C<.>-expansion buffer is now a growable heap buffer. >> C |
|
3733
|
|
|
|
|
|
|
silently dropped expanded terms once full. It is now a buffer that doubles on |
|
3734
|
|
|
|
|
|
|
demand. Appends also went from C (which rescans from the start every |
|
3735
|
|
|
|
|
|
|
time — O(n²) over many columns) to an O(1) amortised append that tracks the |
|
3736
|
|
|
|
|
|
|
write position. |
|
3737
|
|
|
|
|
|
|
|
|
3738
|
|
|
|
|
|
|
B The original |
|
3739
|
|
|
|
|
|
|
C'd a C buffer, filled it, copied it into C, and freed it |
|
3740
|
|
|
|
|
|
|
I — C allocations plus C copies. Each candidate row is now |
|
3741
|
|
|
|
|
|
|
written straight into C at its prospective commit slot; a row that fails |
|
3742
|
|
|
|
|
|
|
listwise deletion is simply overwritten by the next candidate. This removes the |
|
3743
|
|
|
|
|
|
|
C allocate/free cycles and the copy loop entirely. |
|
3744
|
|
|
|
|
|
|
|
|
3745
|
|
|
|
|
|
|
B<< Categorical levels sorted with C. >> The level list used an O(n²) bubble |
|
3746
|
|
|
|
|
|
|
sort; replaced with C (relevant only for high-cardinality factors). |
|
3747
|
|
|
|
|
|
|
|
|
3748
|
|
|
|
|
|
|
B<< Unused tail of C reclaimed after listwise deletion. >> C is allocated for |
|
3749
|
|
|
|
|
|
|
all C rows up front (C is unknown until rows are scanned). When rows |
|
3750
|
|
|
|
|
|
|
are dropped, C is now Ced down to C, returning the unused |
|
3751
|
|
|
|
|
|
|
tail to the allocator before the OLS phase. |
|
3752
|
|
|
|
|
|
|
|
|
3753
|
|
|
|
|
|
|
B The argument-parsing index was widened from |
|
3754
|
|
|
|
|
|
|
C to C to match C, and the HoH row count now uses |
|
3755
|
|
|
|
|
|
|
C rather than relying on C's return value. |
|
3756
|
|
|
|
|
|
|
|
|
3757
|
|
|
|
|
|
|
=head3 Known limitations (left unchanged) |
|
3758
|
|
|
|
|
|
|
|
|
3759
|
|
|
|
|
|
|
=over |
|
3760
|
|
|
|
|
|
|
|
|
3761
|
|
|
|
|
|
|
=item * A multi-way term such as C is split only on the first C<*>, so it yields |
|
3762
|
|
|
|
|
|
|
C, C, and C rather than a full three-way expansion. Deeper |
|
3763
|
|
|
|
|
|
|
interactions silently fail (the unparsable term evaluates to C and the |
|
3764
|
|
|
|
|
|
|
rows are dropped). This matches the documented two-way C<*> support. |
|
3765
|
|
|
|
|
|
|
|
|
3766
|
|
|
|
|
|
|
=item * HoA input takes the row count from the first column; columns shorter than |
|
3767
|
|
|
|
|
|
|
that simply contribute dropped rows rather than raising an error. |
|
3768
|
|
|
|
|
|
|
|
|
3769
|
|
|
|
|
|
|
=back |
|
3770
|
|
|
|
|
|
|
|
|
3771
|
|
|
|
|
|
|
=head3 C |
|
3772
|
|
|
|
|
|
|
|
|
3773
|
|
|
|
|
|
|
=head4 Bug fixes |
|
3774
|
|
|
|
|
|
|
|
|
3775
|
|
|
|
|
|
|
B Nearly every C after an allocation |
|
3776
|
|
|
|
|
|
|
leaked memory. C does a C, so anything allocated but not yet |
|
3777
|
|
|
|
|
|
|
freed is lost. Affected paths: |
|
3778
|
|
|
|
|
|
|
|
|
3779
|
|
|
|
|
|
|
=over |
|
3780
|
|
|
|
|
|
|
|
|
3781
|
|
|
|
|
|
|
=item * AoA and hash first-pass errors leaked C and any C entries |
|
3782
|
|
|
|
|
|
|
allocated so far. |
|
3783
|
|
|
|
|
|
|
|
|
3784
|
|
|
|
|
|
|
=item * Formula-mode "not found as an array ref" errors leaked C and C. |
|
3785
|
|
|
|
|
|
|
|
|
3786
|
|
|
|
|
|
|
=back |
|
3787
|
|
|
|
|
|
|
|
|
3788
|
|
|
|
|
|
|
All post-allocation errors now route through a single C label that frees |
|
3789
|
|
|
|
|
|
|
every pointer unconditionally. Pointers are initialised to C and C |
|
3790
|
|
|
|
|
|
|
is zero-allocated with C, so the cleanup is always safe to run. |
|
3791
|
|
|
|
|
|
|
|
|
3792
|
|
|
|
|
|
|
B<< Undefined and non-numeric cells silently coerced to C<0.0>. >> The original |
|
3793
|
|
|
|
|
|
|
second pass used C<(svp && *svp) ? SvNV(*svp) : 0.0>, meaning an C or |
|
3794
|
|
|
|
|
|
|
non-numeric cell was quietly treated as zero, silently corrupting the |
|
3795
|
|
|
|
|
|
|
F-statistic. Each cell is now validated with C and C; |
|
3796
|
|
|
|
|
|
|
the call dies naming the group and observation index, consistent with the rest |
|
3797
|
|
|
|
|
|
|
of C (C, C, C, etc.). |
|
3798
|
|
|
|
|
|
|
|
|
3799
|
|
|
|
|
|
|
B C |
|
3800
|
|
|
|
|
|
|
cast to C I adding, so an empty array (C returns C<-1>) |
|
3801
|
|
|
|
|
|
|
produced C rather than C<0>. Changed to |
|
3802
|
|
|
|
|
|
|
C so the C<+1> is done in signed arithmetic |
|
3803
|
|
|
|
|
|
|
before the cast. |
|
3804
|
|
|
|
|
|
|
|
|
3805
|
|
|
|
|
|
|
B<< Unreliable group count from C. >> C returns the |
|
3806
|
|
|
|
|
|
|
number of buckets in use rather than the number of keys for tied hashes. |
|
3807
|
|
|
|
|
|
|
Replaced with C, which always returns the correct key count. |
|
3808
|
|
|
|
|
|
|
|
|
3809
|
|
|
|
|
|
|
=head4 Improvements |
|
3810
|
|
|
|
|
|
|
|
|
3811
|
|
|
|
|
|
|
B<< C accepted as an alias for C. >> R users write |
|
3812
|
|
|
|
|
|
|
C; the argument parser now accepts both spellings. |
|
3813
|
|
|
|
|
|
|
|
|
3814
|
|
|
|
|
|
|
B C and manual C replaced |
|
3815
|
|
|
|
|
|
|
with C, C, C, and C. C additionally |
|
3816
|
|
|
|
|
|
|
preserves embedded NUL bytes in group key strings, which the previous |
|
3817
|
|
|
|
|
|
|
C-based copies silently truncated. |
|
3818
|
|
|
|
|
|
|
|
|
3819
|
|
|
|
|
|
|
=head4 Known limitations (not changed) |
|
3820
|
|
|
|
|
|
|
|
|
3821
|
|
|
|
|
|
|
=over |
|
3822
|
|
|
|
|
|
|
|
|
3823
|
|
|
|
|
|
|
=item * A factor column named C or C in a formula call will |
|
3824
|
|
|
|
|
|
|
collide with reserved top-level keys in the result hash. |
|
3825
|
|
|
|
|
|
|
|
|
3826
|
|
|
|
|
|
|
=item * Group names containing an embedded NUL are stored correctly but are still |
|
3827
|
|
|
|
|
|
|
truncated at C when written into the output hash keys. |
|
3828
|
|
|
|
|
|
|
|
|
3829
|
|
|
|
|
|
|
=back |
|
3830
|
|
|
|
|
|
|
|
|
3831
|
|
|
|
|
|
|
=head3 C |
|
3832
|
|
|
|
|
|
|
|
|
3833
|
|
|
|
|
|
|
default view shifted to 80 characters to match Linux window length |
|
3834
|
|
|
|
|
|
|
|
|
3835
|
|
|
|
|
|
|
=head4 New features |
|
3836
|
|
|
|
|
|
|
|
|
3837
|
|
|
|
|
|
|
=over |
|
3838
|
|
|
|
|
|
|
|
|
3839
|
|
|
|
|
|
|
=item * B<< C is accepted as a synonym for C >> (the number of rows shown). |
|
3840
|
|
|
|
|
|
|
Passing both C and C is an error. |
|
3841
|
|
|
|
|
|
|
|
|
3842
|
|
|
|
|
|
|
=item * B C validates its argument names |
|
3843
|
|
|
|
|
|
|
against the documented set (C, C, C, C, C, |
|
3844
|
|
|
|
|
|
|
C, C, C, C, C, C, C) and |
|
3845
|
|
|
|
|
|
|
dies listing any it does not recognise, so a misspelt option (e.g. C) |
|
3846
|
|
|
|
|
|
|
is caught instead of silently ignored. |
|
3847
|
|
|
|
|
|
|
|
|
3848
|
|
|
|
|
|
|
=item * B<< C / C is validated. >> It must be a non-negative integer; C or |
|
3849
|
|
|
|
|
|
|
a non-numeric value now dies with a clear message instead of producing |
|
3850
|
|
|
|
|
|
|
warnings and being treated as C<0>. |
|
3851
|
|
|
|
|
|
|
|
|
3852
|
|
|
|
|
|
|
=item * B |
|
3853
|
|
|
|
|
|
|
|
|
3854
|
|
|
|
|
|
|
=back |
|
3855
|
|
|
|
|
|
|
|
|
3856
|
|
|
|
|
|
|
=head4 Bug fixes |
|
3857
|
|
|
|
|
|
|
|
|
3858
|
|
|
|
|
|
|
=over |
|
3859
|
|
|
|
|
|
|
|
|
3860
|
|
|
|
|
|
|
=item * B<< C<< n =E 0 >> now still prints the column header. >> Column names were collected |
|
3861
|
|
|
|
|
|
|
only from the rows being shown, so requesting zero rows produced an empty |
|
3862
|
|
|
|
|
|
|
header line. At least one row is now scanned (when data exists) so the |
|
3863
|
|
|
|
|
|
|
header always lists the columns. |
|
3864
|
|
|
|
|
|
|
|
|
3865
|
|
|
|
|
|
|
=item * B<< An empty hash (C<{}>) no longer dies. >> It was rejected as |
|
3866
|
|
|
|
|
|
|
I<"neither ARRAY nor HASH">; it is now shown as an empty table |
|
3867
|
|
|
|
|
|
|
(C<0 rows x 0 cols>), matching the handling of an empty array. |
|
3868
|
|
|
|
|
|
|
|
|
3869
|
|
|
|
|
|
|
=item * B<< The C alias now drives the Hash-of-Hashes label header. >> The |
|
3870
|
|
|
|
|
|
|
header for the row-label column consulted only C, so |
|
3871
|
|
|
|
|
|
|
C<< row_names =E 'id' >> displayed C instead of C. Both spellings are |
|
3872
|
|
|
|
|
|
|
now honoured consistently. |
|
3873
|
|
|
|
|
|
|
|
|
3874
|
|
|
|
|
|
|
=item * B A Hash-of-Arrays column or |
|
3875
|
|
|
|
|
|
|
Hash-of-Hashes row whose value is not actually an array/hash reference now |
|
3876
|
|
|
|
|
|
|
renders as empty cells rather than throwing a dereference error. |
|
3877
|
|
|
|
|
|
|
|
|
3878
|
|
|
|
|
|
|
=back |
|
3879
|
|
|
|
|
|
|
|
|
3880
|
|
|
|
|
|
|
=head4 Performance |
|
3881
|
|
|
|
|
|
|
|
|
3882
|
|
|
|
|
|
|
=over |
|
3883
|
|
|
|
|
|
|
|
|
3884
|
|
|
|
|
|
|
=item * Column gathering no longer sorts once per scanned row. Unique column names |
|
3885
|
|
|
|
|
|
|
are collected across the scanned rows and sorted a single time (same output |
|
3886
|
|
|
|
|
|
|
order), and the ellipsis length is computed once rather than per cell. |
|
3887
|
|
|
|
|
|
|
|
|
3888
|
|
|
|
|
|
|
=back |
|
3889
|
|
|
|
|
|
|
|
|
3890
|
|
|
|
|
|
|
=head4 Tests |
|
3891
|
|
|
|
|
|
|
|
|
3892
|
|
|
|
|
|
|
=over |
|
3893
|
|
|
|
|
|
|
|
|
3894
|
|
|
|
|
|
|
=item * C is self-contained (the C implementation is inlined; it loads |
|
3895
|
|
|
|
|
|
|
no other files) and covers the new argument handling, the bug fixes above, |
|
3896
|
|
|
|
|
|
|
and the existing AoH / HoA / HoH behaviour, alignment, truncation, and |
|
3897
|
|
|
|
|
|
|
output-path handling. |
|
3898
|
|
|
|
|
|
|
|
|
3899
|
|
|
|
|
|
|
=back |
|
3900
|
|
|
|
|
|
|
|
|
3901
|
|
|
|
|
|
|
=head3 C |
|
3902
|
|
|
|
|
|
|
|
|
3903
|
|
|
|
|
|
|
Corrected four bugs in the C XSUB plus a portability fix in its exact signed-rank helper. Behaviour on valid input is unchanged: the R-agreement cases (unpaired C, C ; paired one-sided C, C; separated exact C, C) all still match R's C. |
|
3904
|
|
|
|
|
|
|
|
|
3905
|
|
|
|
|
|
|
=head4 Bug fixes |
|
3906
|
|
|
|
|
|
|
|
|
3907
|
|
|
|
|
|
|
=over |
|
3908
|
|
|
|
|
|
|
|
|
3909
|
|
|
|
|
|
|
=item * B<< Invalid C is now rejected. >> Any value other than C or C previously fell through to the two-sided branch and returned a two-sided result mislabelled with the bad string, so a typo like C<< alternative =E "twosided" >> silently "worked". It now croaks unless C is one of C, C, C. |
|
3910
|
|
|
|
|
|
|
|
|
3911
|
|
|
|
|
|
|
=item * B When every observation is tied the approximation's variance collapses to 0 and the old code divided by C: C returned C (a "significant" difference between identical samples). It now warns and returns C . |
|
3912
|
|
|
|
|
|
|
|
|
3913
|
|
|
|
|
|
|
=item * B<< Two-sided continuity correction at C. >> R uses C, so the correction is C<0> when the statistic sits exactly on its mean; the old code used C<-0.5>. Example: C<< wilcox_test([1,4], [2,3], exact =E 0) >> changed from C to C (matches R). |
|
3914
|
|
|
|
|
|
|
|
|
3915
|
|
|
|
|
|
|
=item * B<< C no longer shadows libm. >> The local C accumulator (mean of the statistic) shadowed the C library C; renamed to C (two-sample) and C (signed-rank). No active miscompute, removed as a latent hazard. |
|
3916
|
|
|
|
|
|
|
|
|
3917
|
|
|
|
|
|
|
=back |
|
3918
|
|
|
|
|
|
|
|
|
3919
|
|
|
|
|
|
|
=head4 Cosmetic |
|
3920
|
|
|
|
|
|
|
|
|
3921
|
|
|
|
|
|
|
=over |
|
3922
|
|
|
|
|
|
|
|
|
3923
|
|
|
|
|
|
|
=item * Collapsed a no-op ternary that assigned the same signed-rank exact method string on both branches; the C field is now simply C. |
|
3924
|
|
|
|
|
|
|
|
|
3925
|
|
|
|
|
|
|
=back |
|
3926
|
|
|
|
|
|
|
|
|
3927
|
|
|
|
|
|
|
=head4 Portability (exact signed-rank helper) |
|
3928
|
|
|
|
|
|
|
|
|
3929
|
|
|
|
|
|
|
=over |
|
3930
|
|
|
|
|
|
|
|
|
3931
|
|
|
|
|
|
|
=item * B<< C no longer calls C. >> The C<2^n> normaliser is now built by exact repeated doubling, which has no long-double libm dependency. This fixes an C load failure reported by a CPAN smoker (FreeBSD, perl 5.20, C) whose libm lacks the long-double math functions; the symbol resolved on glibc, which is why local builds passed. C accumulation in the DP is retained — only the C call was at fault. |
|
3932
|
|
|
|
|
|
|
|
|
3933
|
|
|
|
|
|
|
=item * B<< C → C >> for C, C, and the DP loop counters, which also removes a C-to-C narrowing at the call site. The C result (C) stays signed so its negative-C sentinel still fires, and is cast to C only after the C<< k E 0 >> check. |
|
3934
|
|
|
|
|
|
|
|
|
3935
|
|
|
|
|
|
|
=back |
|
3936
|
|
|
|
|
|
|
|
|
3937
|
|
|
|
|
|
|
=head4 Tests |
|
3938
|
|
|
|
|
|
|
|
|
3939
|
|
|
|
|
|
|
=over |
|
3940
|
|
|
|
|
|
|
|
|
3941
|
|
|
|
|
|
|
=item * Added C (flat, no subtests): R-agreement cases, option handling (C, C, C, C, named/positional C/C, NA dropping), regressions for all four bug fixes, argument-error and C-validation checks, output shape, and C coverage of the two-sample, exact, and paired allocation paths. |
|
3942
|
|
|
|
|
|
|
|
|
3943
|
|
|
|
|
|
|
=back |
|
3944
|
|
|
|
|
|
|
|
|
3945
|
|
|
|
|
|
|
=head2 0.15 |
|
3946
|
|
|
|
|
|
|
|
|
3947
|
|
|
|
|
|
|
C function added, similar to R's C |
|
3948
|
|
|
|
|
|
|
|
|
3949
|
|
|
|
|
|
|
C: |
|
3950
|
|
|
|
|
|
|
filter => { |
|
3951
|
|
|
|
|
|
|
'Testosterone, total (nmol/L)' => sub { defined $_ }, |
|
3952
|
|
|
|
|
|
|
} |
|
3953
|
|
|
|
|
|
|
|
|
3954
|
|
|
|
|
|
|
was broken by the change in undefined variables in 0.14, but is back to being C |
|
3955
|
|
|
|
|
|
|
|
|
3956
|
|
|
|
|
|
|
C improvement in sectioning in README |
|
3957
|
|
|
|
|
|
|
|
|
3958
|
|
|
|
|
|
|
Numerous changes to prevent quadmath/long double CPAN test failures |
|
3959
|
|
|
|
|
|
|
|
|
3960
|
|
|
|
|
|
|
Minimum Scalar::Util version in dist.ini is now 1.22, see https://www.cpantesters.org/cpan/report/6b682236-6567-11f1-a3bc-a055f9c4ba34 |
|
3961
|
|
|
|
|
|
|
|
|
3962
|
|
|
|
|
|
|
C is no longer needed, and removed as a dependency |
|
3963
|
|
|
|
|
|
|
|
|
3964
|
|
|
|
|
|
|
=head3 C |
|
3965
|
|
|
|
|
|
|
|
|
3966
|
|
|
|
|
|
|
=head4 Bug fixes |
|
3967
|
|
|
|
|
|
|
|
|
3968
|
|
|
|
|
|
|
=over |
|
3969
|
|
|
|
|
|
|
|
|
3970
|
|
|
|
|
|
|
=item * B C strips a |
|
3971
|
|
|
|
|
|
|
leading comment marker from the header line (so a file may begin with |
|
3972
|
|
|
|
|
|
|
C<#id,val>), but that strip was dead code: the XS parser skipped I line |
|
3973
|
|
|
|
|
|
|
beginning with the comment string before the callback ever saw it, so a |
|
3974
|
|
|
|
|
|
|
commented header was silently dropped and the first data row was mistaken for |
|
3975
|
|
|
|
|
|
|
the header. The parser now delivers the first content line even when it |
|
3976
|
|
|
|
|
|
|
begins with the comment marker, and only skips comment lines after the header |
|
3977
|
|
|
|
|
|
|
has been seen. |
|
3978
|
|
|
|
|
|
|
|
|
3979
|
|
|
|
|
|
|
=item * B The parser stripped |
|
3980
|
|
|
|
|
|
|
C<\r> unconditionally, so a quoted value such as C<"x\ry"> lost its carriage |
|
3981
|
|
|
|
|
|
|
return and would not survive a C -> C round-trip. C<\r> |
|
3982
|
|
|
|
|
|
|
is now stripped only as part of a trailing CRLF line ending and as a stray CR |
|
3983
|
|
|
|
|
|
|
I quotes; inside quotes it is literal data. |
|
3984
|
|
|
|
|
|
|
|
|
3985
|
|
|
|
|
|
|
=item * B<< Duplicate column names no longer corrupt C output. >> With |
|
3986
|
|
|
|
|
|
|
C<< output.type =E 'hoa' >>, a repeated column name pushed the same cell once per |
|
3987
|
|
|
|
|
|
|
occurrence, so the affected columns came out longer than the others and the |
|
3988
|
|
|
|
|
|
|
arrays no longer lined up by row. Columns are now keyed by unique header name |
|
3989
|
|
|
|
|
|
|
(first-seen order preserved, later values win, one warning emitted). |
|
3990
|
|
|
|
|
|
|
|
|
3991
|
|
|
|
|
|
|
=item * B Passing a defined argument |
|
3992
|
|
|
|
|
|
|
that was not a CODE reference silently fell through to slurp mode and ignored |
|
3993
|
|
|
|
|
|
|
the argument; it now croaks |
|
3994
|
|
|
|
|
|
|
(I<"callback must be a CODE reference">). |
|
3995
|
|
|
|
|
|
|
|
|
3996
|
|
|
|
|
|
|
=item * B<< An undefined/empty C row-name now dies instead of keying on C<"">. >> |
|
3997
|
|
|
|
|
|
|
With C<< output.type =E 'hoh' >>, a row whose row-name column was empty/undef was |
|
3998
|
|
|
|
|
|
|
stored under the C<''> key and raised I<"uninitialized value"> warnings. It now |
|
3999
|
|
|
|
|
|
|
dies, naming the column and the offending data row. |
|
4000
|
|
|
|
|
|
|
|
|
4001
|
|
|
|
|
|
|
=item * B A 1-based numeric |
|
4002
|
|
|
|
|
|
|
filter key greater than the column count was accepted, then silently extended |
|
4003
|
|
|
|
|
|
|
every row through the C<$_> write-back. It is now rejected up front with a |
|
4004
|
|
|
|
|
|
|
message naming the column count. |
|
4005
|
|
|
|
|
|
|
|
|
4006
|
|
|
|
|
|
|
=item * B<< C and C together now die. >> Supplying both silently preferred |
|
4007
|
|
|
|
|
|
|
C; passing both is now an explicit error (C remains an alias for |
|
4008
|
|
|
|
|
|
|
C when used alone). |
|
4009
|
|
|
|
|
|
|
|
|
4010
|
|
|
|
|
|
|
=item * B The unknown-argument path used |
|
4011
|
|
|
|
|
|
|
C to dump the offending names to STDOUT before dying; the names are now |
|
4012
|
|
|
|
|
|
|
carried in the C message itself. |
|
4013
|
|
|
|
|
|
|
|
|
4014
|
|
|
|
|
|
|
=back |
|
4015
|
|
|
|
|
|
|
|
|
4016
|
|
|
|
|
|
|
=head4 Better diagnostics |
|
4017
|
|
|
|
|
|
|
|
|
4018
|
|
|
|
|
|
|
=over |
|
4019
|
|
|
|
|
|
|
|
|
4020
|
|
|
|
|
|
|
=item * Alignment errors now report B is ragged |
|
4021
|
|
|
|
|
|
|
(I<"Alignment error on FILE data row N (X fields vs Y headers)">), instead of |
|
4022
|
|
|
|
|
|
|
only the field/header counts. |
|
4023
|
|
|
|
|
|
|
|
|
4024
|
|
|
|
|
|
|
=back |
|
4025
|
|
|
|
|
|
|
|
|
4026
|
|
|
|
|
|
|
=head4 Memory-leak fixes (exception paths) |
|
4027
|
|
|
|
|
|
|
|
|
4028
|
|
|
|
|
|
|
The parser allocated its working buffers (C, C, and — in |
|
4029
|
|
|
|
|
|
|
slurp mode — C) in the XS C block, i.e. I any validation, and |
|
4030
|
|
|
|
|
|
|
freed them only by falling off the end of the function. Any non-local exit |
|
4031
|
|
|
|
|
|
|
therefore leaked: |
|
4032
|
|
|
|
|
|
|
|
|
4033
|
|
|
|
|
|
|
=over |
|
4034
|
|
|
|
|
|
|
|
|
4035
|
|
|
|
|
|
|
=item * the open-failure C leaked the row buffer and field (and the slurp |
|
4036
|
|
|
|
|
|
|
accumulator); |
|
4037
|
|
|
|
|
|
|
|
|
4038
|
|
|
|
|
|
|
=item * far more commonly, a C thrown B — which |
|
4039
|
|
|
|
|
|
|
C does routinely on alignment errors, bad row names, and filter |
|
4040
|
|
|
|
|
|
|
exceptions — unwound straight out of the XS frame and leaked the field, the |
|
4041
|
|
|
|
|
|
|
current row, the line buffer, the slurp accumulator, I
|
|
4042
|
|
|
|
|
|
|
handle>. |
|
4043
|
|
|
|
|
|
|
|
|
4044
|
|
|
|
|
|
|
=back |
|
4045
|
|
|
|
|
|
|
|
|
4046
|
|
|
|
|
|
|
Allocations now happen in C after every croak-able check, and every |
|
4047
|
|
|
|
|
|
|
long-lived resource (the file handle via C, the buffers via |
|
4048
|
|
|
|
|
|
|
C) is tied to the save stack, which an exception unwinds. Measured |
|
4049
|
|
|
|
|
|
|
with C: a C mid-file went from 5 leaked SVs to 0, and an |
|
4050
|
|
|
|
|
|
|
open failure from 2 to 0. This is the likely source of the constant-size leaks |
|
4051
|
|
|
|
|
|
|
seen in CPAN-tester reports for the exception-path tests. |
|
4052
|
|
|
|
|
|
|
|
|
4053
|
|
|
|
|
|
|
=head4 Performance |
|
4054
|
|
|
|
|
|
|
|
|
4055
|
|
|
|
|
|
|
=over |
|
4056
|
|
|
|
|
|
|
|
|
4057
|
|
|
|
|
|
|
=item * B<~2.5x faster parsing> (57 -> 145 MB/s on a 100k-row quoted file). The core |
|
4058
|
|
|
|
|
|
|
loop appended one character at a time with C; it now |
|
4059
|
|
|
|
|
|
|
scans runs of ordinary bytes with C / a bounded scan and appends each |
|
4060
|
|
|
|
|
|
|
run in a single C, copying field contents in bulk rather than byte |
|
4061
|
|
|
|
|
|
|
by byte. |
|
4062
|
|
|
|
|
|
|
|
|
4063
|
|
|
|
|
|
|
=back |
|
4064
|
|
|
|
|
|
|
|
|
4065
|
|
|
|
|
|
|
=head4 Internal / non-behavioral |
|
4066
|
|
|
|
|
|
|
|
|
4067
|
|
|
|
|
|
|
=over |
|
4068
|
|
|
|
|
|
|
|
|
4069
|
|
|
|
|
|
|
=item * XS declarations moved from C to C; allocations deferred into |
|
4070
|
|
|
|
|
|
|
C (see the leak fixes above). |
|
4071
|
|
|
|
|
|
|
|
|
4072
|
|
|
|
|
|
|
=item * The filter loop now aliases the row hash with C |
|
4073
|
|
|
|
|
|
|
instead of copying it with C. This removes a full |
|
4074
|
|
|
|
|
|
|
per-row hash copy for every filtered row and fixes a latent staleness bug: |
|
4075
|
|
|
|
|
|
|
after a filter mutated C<$_> and the change was written back, C<%_> still |
|
4076
|
|
|
|
|
|
|
reflected the pre-mutation copy, so a subsequent filter in the same row saw |
|
4077
|
|
|
|
|
|
|
stale values. With aliasing, C<%_> I the row, so write-backs are always |
|
4078
|
|
|
|
|
|
|
visible. |
|
4079
|
|
|
|
|
|
|
|
|
4080
|
|
|
|
|
|
|
=back |
|
4081
|
|
|
|
|
|
|
|
|
4082
|
|
|
|
|
|
|
=head4 Known limitation (not changed) |
|
4083
|
|
|
|
|
|
|
|
|
4084
|
|
|
|
|
|
|
=over |
|
4085
|
|
|
|
|
|
|
|
|
4086
|
|
|
|
|
|
|
=item * B<< C does not round-trip back to C. >> C renders an |
|
4087
|
|
|
|
|
|
|
C cell as an empty field by default, and C maps an empty |
|
4088
|
|
|
|
|
|
|
field back to C, so the I round-trip is clean. But if a file is |
|
4089
|
|
|
|
|
|
|
written with a token such as C<< 'undef.val' =E 'NA' >>, C has no |
|
4090
|
|
|
|
|
|
|
inverse option and reads C back as the string C<'NA'>. C also |
|
4091
|
|
|
|
|
|
|
cannot distinguish a deliberately quoted empty string (C<"">) from a missing |
|
4092
|
|
|
|
|
|
|
value -- both become C. Adding an C-style option to |
|
4093
|
|
|
|
|
|
|
C (mapping configurable tokens and/or empty fields to C) |
|
4094
|
|
|
|
|
|
|
would close this gap. |
|
4095
|
|
|
|
|
|
|
|
|
4096
|
|
|
|
|
|
|
=back |
|
4097
|
|
|
|
|
|
|
|
|
4098
|
|
|
|
|
|
|
=head3 C |
|
4099
|
|
|
|
|
|
|
|
|
4100
|
|
|
|
|
|
|
=head4 Behavior change |
|
4101
|
|
|
|
|
|
|
|
|
4102
|
|
|
|
|
|
|
=over |
|
4103
|
|
|
|
|
|
|
|
|
4104
|
|
|
|
|
|
|
=item * B<< C cells now write as an empty field, not an empty string. >> A missing |
|
4105
|
|
|
|
|
|
|
or C value renders as nothing between separators (C) rather than a |
|
4106
|
|
|
|
|
|
|
quoted empty string (C / C). Supplying C<< 'undef.val' =E 'NA' >> |
|
4107
|
|
|
|
|
|
|
(or any other token) still overrides this, exactly as before. This is the |
|
4108
|
|
|
|
|
|
|
only change that can alter the bytes of an existing output file; if you relied |
|
4109
|
|
|
|
|
|
|
on the previous default, pass C<< 'undef.val' =E '' >> to keep an explicit empty |
|
4110
|
|
|
|
|
|
|
field, or your chosen placeholder. |
|
4111
|
|
|
|
|
|
|
|
|
4112
|
|
|
|
|
|
|
=back |
|
4113
|
|
|
|
|
|
|
|
|
4114
|
|
|
|
|
|
|
=head4 Bug fixes |
|
4115
|
|
|
|
|
|
|
|
|
4116
|
|
|
|
|
|
|
=over |
|
4117
|
|
|
|
|
|
|
|
|
4118
|
|
|
|
|
|
|
=item * B |
|
4119
|
|
|
|
|
|
|
Previously, cells were looked up with the raw bytes of the column name |
|
4120
|
|
|
|
|
|
|
(C), which fails to match a |
|
4121
|
|
|
|
|
|
|
UTF-8-flagged hash key: the column header printed correctly but every cell |
|
4122
|
|
|
|
|
|
|
under it came back empty. All lookups now fetch by SV (C), header |
|
4123
|
|
|
|
|
|
|
lists are gathered and sorted as SVs (C + C, preserving the |
|
4124
|
|
|
|
|
|
|
flag) instead of being round-tripped through C, and the C |
|
4125
|
|
|
|
|
|
|
column is matched with C rather than C. Embedded NUL bytes in |
|
4126
|
|
|
|
|
|
|
keys are handled correctly as a side effect. |
|
4127
|
|
|
|
|
|
|
|
|
4128
|
|
|
|
|
|
|
=item * B<< C<< col.names =E [] >> no longer loops forever. >> An empty C array made |
|
4129
|
|
|
|
|
|
|
C return C<-1>, which — compared against an unsigned C loop |
|
4130
|
|
|
|
|
|
|
index — wrapped to C and ran effectively without end. This was fixed |
|
4131
|
|
|
|
|
|
|
for flat hashes previously; it was still present for hash-of-hashes, |
|
4132
|
|
|
|
|
|
|
hash-of-arrays, and array-of-hashes, plus both C header-filtering |
|
4133
|
|
|
|
|
|
|
loops. All such loops now use a signed index. |
|
4134
|
|
|
|
|
|
|
|
|
4135
|
|
|
|
|
|
|
=item * B One header loop used an |
|
4136
|
|
|
|
|
|
|
C index that silently wrapped past 65,535 and never terminated. |
|
4137
|
|
|
|
|
|
|
It now uses C like the rest of the code. |
|
4138
|
|
|
|
|
|
|
|
|
4139
|
|
|
|
|
|
|
=item * B Every other input shape |
|
4140
|
|
|
|
|
|
|
rejects a nested reference with |
|
4141
|
|
|
|
|
|
|
I<"Cannot write nested reference types to table">; a flat hash instead |
|
4142
|
|
|
|
|
|
|
stringified it (e.g. C) into the file. It now croaks |
|
4143
|
|
|
|
|
|
|
consistently. |
|
4144
|
|
|
|
|
|
|
|
|
4145
|
|
|
|
|
|
|
=item * B<< C<< 'undef.val' =E undef >> is handled cleanly. >> It previously called |
|
4146
|
|
|
|
|
|
|
C on C, raising an I<"uninitialized value"> warning and |
|
4147
|
|
|
|
|
|
|
yielding an empty string by accident. It is now treated explicitly as an empty |
|
4148
|
|
|
|
|
|
|
field, with no warning. |
|
4149
|
|
|
|
|
|
|
|
|
4150
|
|
|
|
|
|
|
=back |
|
4151
|
|
|
|
|
|
|
|
|
4152
|
|
|
|
|
|
|
=head4 Memory-leak fixes (exception paths) |
|
4153
|
|
|
|
|
|
|
|
|
4154
|
|
|
|
|
|
|
=over |
|
4155
|
|
|
|
|
|
|
|
|
4156
|
|
|
|
|
|
|
=item * The row-key list gathered for hash-of-hashes input was leaked when the output |
|
4157
|
|
|
|
|
|
|
file could not be opened. |
|
4158
|
|
|
|
|
|
|
|
|
4159
|
|
|
|
|
|
|
=item * The I<"Could not get headers"> croak on hash-of-arrays input leaked both the |
|
4160
|
|
|
|
|
|
|
already-open filehandle and the headers array. |
|
4161
|
|
|
|
|
|
|
|
|
4162
|
|
|
|
|
|
|
=back |
|
4163
|
|
|
|
|
|
|
|
|
4164
|
|
|
|
|
|
|
=head4 Internal / non-behavioral |
|
4165
|
|
|
|
|
|
|
|
|
4166
|
|
|
|
|
|
|
=over |
|
4167
|
|
|
|
|
|
|
|
|
4168
|
|
|
|
|
|
|
=item * Numeric row labels are now formatted into a reused stack buffer instead of a |
|
4169
|
|
|
|
|
|
|
per-row C / C allocation (no functional change; removes a |
|
4170
|
|
|
|
|
|
|
cast-away-C and one allocation per row). |
|
4171
|
|
|
|
|
|
|
|
|
4172
|
|
|
|
|
|
|
=item * Several signed/unsigned index types were made consistent (C vs |
|
4173
|
|
|
|
|
|
|
C) to match C and silence the conditions behind the loop bugs |
|
4174
|
|
|
|
|
|
|
above. |
|
4175
|
|
|
|
|
|
|
|
|
4176
|
|
|
|
|
|
|
=back |
|
4177
|
|
|
|
|
|
|
|
|
4178
|
|
|
|
|
|
|
=head4 Tests |
|
4179
|
|
|
|
|
|
|
|
|
4180
|
|
|
|
|
|
|
=over |
|
4181
|
|
|
|
|
|
|
|
|
4182
|
|
|
|
|
|
|
=item * C expanded from 17 to 69 assertions. New coverage targets each |
|
4183
|
|
|
|
|
|
|
fix above: the empty-field default and C<< undef.val =E undef >> (no warning), |
|
4184
|
|
|
|
|
|
|
C<< col.names =E [] >> termination across all four input shapes, the |
|
4185
|
|
|
|
|
|
|
>65,535-column header loop (gated behind C), in-sequence |
|
4186
|
|
|
|
|
|
|
numeric row labels, nested-reference rejection, CSV quoting corners |
|
4187
|
|
|
|
|
|
|
(carriage return, separators inside column names, multi-character separators), |
|
4188
|
|
|
|
|
|
|
empty input writing no file, and UTF-8 column names and row keys. Two leak |
|
4189
|
|
|
|
|
|
|
assertions cover the exception paths above. |
|
4190
|
|
|
|
|
|
|
|
|
4191
|
|
|
|
|
|
|
=back |
|
4192
|
|
|
|
|
|
|
|
|
4193
|
|
|
|
|
|
|
=head2 0.14 |
|
4194
|
|
|
|
|
|
|
|
|
4195
|
|
|
|
|
|
|
C function added for rows |
|
4196
|
|
|
|
|
|
|
|
|
4197
|
|
|
|
|
|
|
C reads undefined values to C instead of C, which makes calculations easier |
|
4198
|
|
|
|
|
|
|
|
|
4199
|
|
|
|
|
|
|
C writes undef by default as an empty string C<''> |
|
4200
|
|
|
|
|
|
|
|
|
4201
|
|
|
|
|
|
|
C transforms a hash of hashes into an hash of arrays |
|
4202
|
|
|
|
|
|
|
|
|
4203
|
|
|
|
|
|
|
C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b |
|
4204
|
|
|
|
|
|
|
|
|
4205
|
|
|
|
|
|
|
Numerous switches from C to C for local precision, like above |
|
4206
|
|
|
|
|
|
|
|
|
4207
|
|
|
|
|
|
|
numerous changes to C for ease of use and working with datasets with numerous undefined values |
|
4208
|
|
|
|
|
|
|
|
|
4209
|
|
|
|
|
|
|
dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea |
|
4210
|
|
|
|
|
|
|
|
|
4211
|
|
|
|
|
|
|
C now should be complete, errors with confidence intervals fixed |
|
4212
|
|
|
|
|
|
|
|
|
4213
|
|
|
|
|
|
|
=head2 0.13 |
|
4214
|
|
|
|
|
|
|
|
|
4215
|
|
|
|
|
|
|
C: speed improvements; commented headers are now allowed |
|
4216
|
|
|
|
|
|
|
|
|
4217
|
|
|
|
|
|
|
C: fix for |
|
4218
|
|
|
|
|
|
|
|
|
4219
|
|
|
|
|
|
|
Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182. |
|
4220
|
|
|
|
|
|
|
main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203 |
|
4221
|
|
|
|
|
|
|
Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183. |
|
4222
|
|
|
|
|
|
|
main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203 |
|
4223
|
|
|
|
|
|
|
|
|
4224
|
|
|
|
|
|
|
C gives better warnings for incorrect types of data given |
|
4225
|
|
|
|
|
|
|
|
|
4226
|
|
|
|
|
|
|
Numerous changes to dist.ini to improve CPAN testing, especially for Win32 |
|
4227
|
|
|
|
|
|
|
|
|
4228
|
|
|
|
|
|
|
=head2 0.12 |
|
4229
|
|
|
|
|
|
|
|
|
4230
|
|
|
|
|
|
|
C can also take hash of arrays, and various mixes of data types |
|
4231
|
|
|
|
|
|
|
|
|
4232
|
|
|
|
|
|
|
C: Addition of C keywords in many places; should improve CPU performance |
|
4233
|
|
|
|
|
|
|
|
|
4234
|
|
|
|
|
|
|
Better POD formatting, correction of output hash for README's C |
|
4235
|
|
|
|
|
|
|
|
|
4236
|
|
|
|
|
|
|
C can now accept hash of hashes as input |
|
4237
|
|
|
|
|
|
|
|
|
4238
|
|
|
|
|
|
|
new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns |
|
4239
|
|
|
|
|
|
|
|
|
4240
|
|
|
|
|
|
|
removed unused function from C helpers |
|
4241
|
|
|
|
|
|
|
|
|
4242
|
|
|
|
|
|
|
C: addition of restrict keywords in preinit, should improve CPU performance |
|
4243
|
|
|
|
|
|
|
|
|
4244
|
|
|
|
|
|
|
MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing |
|
4245
|
|
|
|
|
|
|
|
|
4246
|
|
|
|
|
|
|
using C for tests of C, which may or may not work with CPAN testers (experimental) |
|
4247
|
|
|
|
|
|
|
|
|
4248
|
|
|
|
|
|
|
Added function name to warnings, so I actually know which function is producing the error |
|
4249
|
|
|
|
|
|
|
|
|
4250
|
|
|
|
|
|
|
C can also take C and C as args, in addition to positions |
|
4251
|
|
|
|
|
|
|
|
|
4252
|
|
|
|
|
|
|
fixed C as it could hang if given empty C or C |
|
4253
|
|
|
|
|
|
|
|
|
4254
|
|
|
|
|
|
|
Added C<__EXTENSIONS__> to source XS file for better CPAN testing |
|
4255
|
|
|
|
|
|
|
|
|
4256
|
|
|
|
|
|
|
=head2 0.11 |
|
4257
|
|
|
|
|
|
|
|
|
4258
|
|
|
|
|
|
|
better POD formatting for tables |
|
4259
|
|
|
|
|
|
|
|
|
4260
|
|
|
|
|
|
|
addition of MANIFEST.skip to get better testing results on CPAN |
|
4261
|
|
|
|
|
|
|
|
|
4262
|
|
|
|
|
|
|
C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t |
|
4263
|
|
|
|
|
|
|
|
|
4264
|
|
|
|
|
|
|
C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes |
|
4265
|
|
|
|
|
|
|
|
|
4266
|
|
|
|
|
|
|
Better documentation for t-test |
|
4267
|
|
|
|
|
|
|
|
|
4268
|
|
|
|
|
|
|
=head2 0.10 |
|
4269
|
|
|
|
|
|
|
|
|
4270
|
|
|
|
|
|
|
changes to compilation for CPAN, trying to get this work on Windows |
|
4271
|
|
|
|
|
|
|
|
|
4272
|
|
|
|
|
|
|
Addition of C and C |
|
4273
|
|
|
|
|
|
|
|
|
4274
|
|
|
|
|
|
|
C will work without key names, just like in R. Testing for C has improved. |
|
4275
|
|
|
|
|
|
|
|
|
4276
|
|
|
|
|
|
|
=head2 0.09 |
|
4277
|
|
|
|
|
|
|
|
|
4278
|
|
|
|
|
|
|
context changes in XS C, C, and C to get better CPAN testing results |
|
4279
|
|
|
|
|
|
|
|
|
4280
|
|
|
|
|
|
|
C keywords added to C to increase speed |
|
4281
|
|
|
|
|
|
|
|
|
4282
|
|
|
|
|
|
|
=head2 0.08 |
|
4283
|
|
|
|
|
|
|
|
|
4284
|
|
|
|
|
|
|
Speed improvement in C of hashes. |
|
4285
|
|
|
|
|
|
|
|
|
4286
|
|
|
|
|
|
|
Addition of C, C, C, C, and C functions |
|
4287
|
|
|
|
|
|
|
|
|
4288
|
|
|
|
|
|
|
Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call. |
|
4289
|
|
|
|
|
|
|
|
|
4290
|
|
|
|
|
|
|
Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better. |
|
4291
|
|
|
|
|
|
|
|
|
4292
|
|
|
|
|
|
|
C now returns hash-of-hash in {row}{column} |
|
4293
|
|
|
|
|
|
|
|
|
4294
|
|
|
|
|
|
|
=head2 0.07 |
|
4295
|
|
|
|
|
|
|
|
|
4296
|
|
|
|
|
|
|
Addition of C function. |
|
4297
|
|
|
|
|
|
|
|
|
4298
|
|
|
|
|
|
|
Formulas can now be omitted from C, resulting in a stacked calculation as R would think. |
|
4299
|
|
|
|
|
|
|
|
|
4300
|
|
|
|
|
|
|
Addition of C for multi-group comparisons that does not assume normality like C does. |
|
4301
|
|
|
|
|
|
|
|
|
4302
|
|
|
|
|
|
|
C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name. |
|
4303
|
|
|
|
|
|
|
|
|
4304
|
|
|
|
|
|
|
=head2 0.06 |
|
4305
|
|
|
|
|
|
|
|
|
4306
|
|
|
|
|
|
|
Changed compiler options so that Solaris will work |
|
4307
|
|
|
|
|
|
|
|
|
4308
|
|
|
|
|
|
|
signed integers changed to unsigned in C |
|
4309
|
|
|
|
|
|
|
|
|
4310
|
|
|
|
|
|
|
Added restrict keywords to C, and made C to C |
|
4311
|
|
|
|
|
|
|
|
|
4312
|
|
|
|
|
|
|
=head2 0.05 |
|
4313
|
|
|
|
|
|
|
|
|
4314
|
|
|
|
|
|
|
Leak testing for C |
|
4315
|
|
|
|
|
|
|
|
|
4316
|
|
|
|
|
|
|
removal of Data::Printer dependency for easier CPAN testing |
|
4317
|
|
|
|
|
|
|
|
|
4318
|
|
|
|
|
|
|
switched several C variable to C so that clang doesn't complain |
|
4319
|
|
|
|
|
|
|
|
|
4320
|
|
|
|
|
|
|
added restrict keyword for C |
|
4321
|
|
|
|
|
|
|
|
|
4322
|
|
|
|
|
|
|
=head2 0.04 |
|
4323
|
|
|
|
|
|
|
|
|
4324
|
|
|
|
|
|
|
addition of C function |
|
4325
|
|
|
|
|
|
|
|
|
4326
|
|
|
|
|
|
|
GNU source, to maximize compatibility and ease installation |
|
4327
|
|
|
|
|
|
|
|
|
4328
|
|
|
|
|
|
|
removal of JSON dependency to ease installation |
|
4329
|
|
|
|
|
|
|
|
|
4330
|
|
|
|
|
|
|
=head2 0.03 |
|
4331
|
|
|
|
|
|
|
|
|
4332
|
|
|
|
|
|
|
Compatibility back to Perl 5.10 |
|
4333
|
|
|
|
|
|
|
|
|
4334
|
|
|
|
|
|
|
=head2 0.02 |
|
4335
|
|
|
|
|
|
|
|
|
4336
|
|
|
|
|
|
|
back-compatible to Perl 5.10, instead of original 5.40, ensuring more people can use it |
|
4337
|
|
|
|
|
|
|
|
|
4338
|
|
|
|
|
|
|
added var_test |
|
4339
|
|
|
|
|
|
|
|
|
4340
|
|
|
|
|
|
|
mean, min, sum, median, var, and max die with undefined values, and print the offending indices |
|
4341
|
|
|
|
|
|
|
|
|
4342
|
|
|
|
|
|
|
"group_stats" added to aov, for TukeyHSD in the future |
|
4343
|
|
|
|
|
|
|
|
|
4344
|
|
|
|
|
|
|
"cor" dies when given data with standard deviation of 0 |
|
4345
|
|
|
|
|
|
|
|
|
4346
|
|
|
|
|
|
|
C now has C option, which shows how undefined values are printed to tables, which is C by default. |