File Coverage

blib/lib/Stats/LikeR.pm
Criterion Covered Total %
statement 387 395 97.9
branch 212 236 89.8
condition 60 81 74.0
subroutine 35 35 100.0
pod 4 4 100.0
total 698 751 92.9


are exported by default.
line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             # ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance
3             require 5.010;
4 23     23   1201268 use strict;
  23         37  
  23         810  
5 23     23   151 use feature 'say';
  23         37  
  23         6359  
6             package Stats::LikeR;
7             our $VERSION = 0.16;
8             require XSLoader;
9 23     23   14427 use Devel::Confess 'color';
  23         224576  
  23         87  
10 23     23   2358 use warnings FATAL => 'all';
  23         60  
  23         1067  
11 23     23   9284 use autodie ':default';
  23         341909  
  23         80  
12 23     23   110512 use Exporter 'import';
  23         38  
  23         891  
13 23     23   106 use Scalar::Util 'looks_like_number';
  23         32  
  23         14366  
14             XSLoader::load('Stats::LikeR', $VERSION);
15             our @EXPORT_OK = qw(add_data aoh2hoa aov cfilter chisq_test col col2col cor cor_test cov csort dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test view wilcox_test write_table);
16             our @EXPORT = @EXPORT_OK;
17              
18             require XSLoader;
19             # ---- filter DSL: col() builds a predicate via overloading (pure Perl) -------
20             # Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm;
21             # they need no XS. filter() is the XSUB.
22 50     50 1 274588 sub col { Stats::LikeR::Col->new($_[0]) }
23             {
24             package Stats::LikeR::Col;
25 50   33 50   351 sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] }
26             # build a comparison leaf; if operands were swapped (4 > col('x')), flip the op
27             sub _c {
28 50     50   98 my ($self, $val, $swapped, $op, $flip) = @_;
29 50 100       127 Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val);
30             }
31             use overload
32 19     19   42 '>' => sub { $_[0]->_c($_[1],$_[2],'>','<') },
33 7     7   23 '<' => sub { $_[0]->_c($_[1],$_[2],'<','>') },
34 2     2   7 '>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') },
35 3     3   9 '<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') },
36 4     4   11 '==' => sub { $_[0]->_c($_[1],$_[2],'==','==') },
37 2     2   6 '!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') },
38 2     2   6 'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') },
39 2     2   6 'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') },
40 2     2   6 'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') },
41 2     2   5 'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') },
42 3     3   10 'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') },
43 2     2   7 'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') },
44 23     23   206 fallback => 1;
  23         43  
  23         489  
45             }
46             {
47             package Stats::LikeR::Pred;
48 50     50   504 sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' }
49 8     8   68 sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' }
50             use overload
51 4     4   11 '&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) },
52 2     2   6 '|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) },
53 2     2   6 '!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) },
54 23     23   7861 fallback => 1;
  23         31  
  23         157  
55             }
56             sub summary {
57 15     15 1 799852 my ($data, %args);
58 15         128 my $current_sub = (split(/::/,(caller(0))[3]))[-1];
59              
60 15 100 66     99 if (@_ && ref $_[0]) {
61             # Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3)
62 10         16 $data = shift;
63 10         22 %args = @_; # capture any trailing key/value pairs
64             } else {
65             # Handles: summary(@runif) or summary(@runif, nrows => 2)
66             # Extract known trailing named arguments from the flat list
67 5   33     51 while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {
      33        
      66        
68 2         3 my $val = pop @_;
69 2         3 my $key = pop @_;
70 2         11 $args{$key} = $val;
71             }
72             # The remaining items in @_ make up the actual data array
73 5         17 my @list = @_;
74 5         14 $data = \@list;
75             }
76             # Normalize nrow -> nrows, default to 10
77 15   100     105 $args{nrows} //= delete($args{nrow}) // 10;
      66        
78 15         28 my $ref_type = ref $data;
79 15 100 100     72 if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) {
80 1         14 die "$current_sub' data must either be a hash or an array, not \"$ref_type\"";
81             }
82 14         21 my $single_arr = 0;
83 14 100 100     53 if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) {
84 8         15 $single_arr = 1;
85             }
86 14         37 my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');
87 14         19 my @out;
88 14 100       40 if ($single_arr == 1) {
    100          
    50          
89 8         14 push @out, '-' x 75;
90 8         42 my $header = sprintf('%9s ' x scalar @header, @header);
91 8         14 push @out, $header;
92 8         12 push @out, '-' x 75;
93 8         12 my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1;
  219         402  
  8         30  
94 8 100       45 if (scalar @undef > 0) {
95 1         27 say STDERR join (',', @undef);
96 1         13 die "The above indices are not defined in $current_sub";
97             }
98 7         10 my @numeric = grep {looks_like_number($_)} @{ $data };
  216         334  
  7         28  
99 7         158 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
100 7         202 my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
101 7         30 push @out, $vals;
102             } elsif ($ref_type eq 'ARRAY') {
103 3         7 push @out, '-' x 75;
104 3         19 my $header = sprintf('%9s ' x scalar @header, @header);
105 3         7 unshift @header, 'Index';
106 3         7 $header = 'Index ' . $header;
107 3         5 push @out, $header;
108 3         5 push @out, '-' x 75;
109 3         5 my $rows_printed = 0;
110 3         10 foreach my $index (0..$#$data) {
111 6         11 my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1;
  36         90  
  6         15  
112 6 50       18 if (scalar @undef > 0) {
113 0         0 say STDERR join (',', @undef);
114 0         0 die "The above indices are not defined for index $index in $current_sub";
115             }
116 6         13 my @numeric = grep {looks_like_number($_)} @{ $data->[$index] };
  36         86  
  6         13  
117 6         60 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
118 6         118 my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
119 6         14 push @out, $vals;
120 6         9 $rows_printed++;
121 6 100       28 last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe
122             }
123             } elsif ($ref_type eq 'HASH') {
124 3         8 push @out, '-' x 78;
125 3         18 my $header = sprintf('%9s ' x scalar @header, @header);
126 3         10 unshift @header, 'Key';
127 3         9 $header = ' Key ' . $header;
128 3         6 push @out, $header;
129 3         6 push @out, '-' x 78;
130 3         6 my $rows_printed = 0;
131 3         7 foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) {
  4         17  
  3         19  
132 5         11 my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1;
  33         64  
  5         17  
133 5 50       16 if (scalar @undef > 0) {
134 0         0 say STDERR join (',', @undef);
135 0         0 die "The above indices are not defined for key $key in $current_sub";
136             }
137 5         9 my @numeric = grep {looks_like_number($_)} @{ $data->{$key} };
  33         73  
  5         11  
138 5         60 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
139 5         15 my $print_key = substr($key, 0, 9);
140 5 50       15 if ((length $print_key) < 9) { # make sure that short keys line up correctly
141 5         12 $print_key .= ' ' x (9 - length $print_key);
142             }
143 5         116 my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
144 5         15 push @out, $vals;
145 5         7 $rows_printed++;
146 5 100       45 last if $rows_printed >= $args{nrows};
147             }
148             }
149 13         110 say join ("\n", @out);
150 13         77 return \@out;
151             }
152              
153             sub read_table {
154 552     552 1 975312 my $file = shift;
155 552 100       5058 die "read_table: \"$file\" is not a file\n" unless -f $file;
156 551 50       3342 die "read_table: \"$file\" is not readable\n" unless -r $file;
157              
158 551         727 my %input_args = @_;
159 551 100       1009 if (exists $input_args{delim}) {
160             # FIX: sep + delim together used to silently prefer delim
161             die "read_table: pass either 'sep' or 'delim', not both\n"
162 2 100       13 if exists $input_args{sep};
163 1         3 $input_args{sep} = delete $input_args{delim};
164             }
165              
166 550 100       1475 my $default_sep = $file =~ /\.tsv$/i ? "\t" : ',';
167 550         1196 my %args = (
168             sep => $default_sep,
169             comment => '#',
170             %input_args,
171             );
172              
173 550         724 my %allowed_args = map { $_ => 1 } (
  2750         3378  
174             'comment', 'output.type', 'filter', 'row.names', 'sep',
175             );
176 550         962 my @undef_args = sort grep { !$allowed_args{$_} } keys %args;
  1139         1649  
177 550 100       851 if (@undef_args) {
178 1         3 my $current_sub = ( split /::/, (caller(0))[3] )[-1];
179             # FIX: no more printing to STDOUT from a library ('say'); the die
180             # message carries the offending argument names itself
181 1         39 die "the args \"@undef_args\" aren't defined for $current_sub\n";
182             }
183 549   100     1219 my $otype = $args{'output.type'} // 'aoh';
184 549 100       1236 die "read_table: output.type \"$otype\" isn't allowed (aoh, hoa, hoh)\n"
185             unless $otype =~ m/^(?:aoh|hoa|hoh)$/;
186              
187 547         590 my $filter = $args{filter};
188 547 100 100     1350 if (defined $filter && ref($filter) eq 'CODE') {
    100 100        
189 1         5 $filter = { 0 => $filter };
190             } elsif (defined $filter && ref($filter) ne 'HASH') {
191 1         6 die "'filter' must be a CODE or HASH reference\n";
192             }
193              
194 546         560 my (@data, %data, @header, @uniq_header,
195             %mapped_filters, @sorted_filter_flds, %seen_rownames);
196 546         508 my $data_row = 0;
197             _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {
198 6780     6780   7810 my ($line_ref) = @_;
199 6780 100       8376 if (!@header) {
200             # --- HEADER PROCESSING (copy made only here; runs once) ---
201 546         918 my @line = @$line_ref;
202             $line[0] =~ s/^\Q$args{comment}\E//
203 546 50 33     3461 if @line && defined $line[0] && length( $args{comment} // '' );
      50        
      33        
204 546         763 @header = @line;
205 546 100 66     1164 if (@header && $header[0] eq '') {
206 11         28 $header[0] = 'row_name';
207             }
208 546         501 my %seen_h;
209             # FIX: hoa output with duplicate column names used to push the
210             # same cell once PER OCCURRENCE, silently corrupting the column
211             # lengths. Iterate unique names (order preserved) instead.
212 546         576 @uniq_header = grep { !$seen_h{$_}++ } @header;
  1223         2272  
213 546         574 my @dup_cols = grep { $seen_h{$_} > 1 } @uniq_header;
  1221         1462  
214 546 100       1147 warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n"
215             if @dup_cols;
216 546 100 100     811 if ($otype eq 'hoh' && !defined $args{'row.names'}) {
217 6         12 $args{'row.names'} = $header[0];
218             }
219 546 100 100     814 if (defined $args{'row.names'}
220 71         112 && !grep { $_ eq $args{'row.names'} } @header) {
221 1         7 die "\"$args{'row.names'}\" isn't in the header of $file\n";
222             }
223 545 100       687 if ($filter) {
224 11         28 for my $k (keys %$filter) {
225 12 100       40 if ($k =~ /^\d+$/) {
226             # FIX: a numeric key past the last column used to be
227             # accepted and then silently extended every row via
228             # the $_ write-back
229 3 100       32 die "read_table: numeric filter key $k exceeds the "
230             . scalar(@header) . " columns of $file\n"
231             if $k > @header;
232 2         4 $mapped_filters{$k} = $filter->{$k};
233             } else {
234 9         20 my ($idx) = grep { $header[$_] eq $k } 0 .. $#header;
  54         67  
235 9 100       22 die "Filter column '$k' not found in header\n"
236             unless defined $idx;
237 8         26 $mapped_filters{ $idx + 1 } = $filter->{$k};
238             }
239             }
240 9         22 @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters;
  1         5  
241             }
242 543         2052 return;
243             }
244             # --- DATA PROCESSING (operate on $line_ref directly; no row copy) ---
245 6234         5488 $data_row++;
246 6234 100       7575 if (@$line_ref != @header) {
247             # FIX: alignment errors now say WHICH row is ragged
248 2         18 die sprintf "Alignment error on %s data row %d (%d fields vs %d headers).\n",
249             $file, $data_row, scalar @$line_ref, scalar @header;
250             }
251 6232         5715 my %line_hash;
252 6232         8025 for my $i (0 .. $#header) {
253 74819         72881 my $v = $line_ref->[$i];
254 74819 100 66     158118 $line_hash{ $header[$i] } = ( !defined($v) || $v eq '' ) ? undef : $v;
255             }
256             # --- APPLY FILTERS ---
257 6232 100       7821 if (@sorted_filter_flds) {
258             # FIX: 'local %_ = %line_hash' copied the whole row for every
259             # filtered row AND went stale after a $_ write-back. Aliasing the
260             # glob makes %_ THE row hash: zero copy, mutations always visible.
261 1862         1903 local *_ = \%line_hash;
262 1862         1720 my $skip = 0;
263 1862         1899 foreach my $fld (@sorted_filter_flds) {
264 1865 100       2706 local $_ = $fld == 0 ? $line_ref : $line_hash{ $header[ $fld - 1 ] };
265 1865 100       2533 if ( !$mapped_filters{$fld}->( $line_ref, \%line_hash ) ) {
266 1137         2247 $skip = 1;
267 1137         1111 last;
268             }
269 728 100       1820 if ( $fld > 0 ) { # write back any mutation made to $_
270 727         709 $line_ref->[ $fld - 1 ] = $_;
271 727 100 100     1635 $line_hash{ $header[ $fld - 1 ] }
272             = ( !defined($_) || $_ eq '' ) ? undef : $_;
273             }
274             }
275 1862 100       8849 return if $skip;
276             }
277             # Populate requested data structure
278 5095 100       7354 if ($otype eq 'aoh') {
    100          
    50          
279 1886         14820 push @data, \%line_hash;
280             } elsif ($otype eq 'hoa') {
281 1118         1172 push @{ $data{$_} }, $line_hash{$_} for @uniq_header;
  15767         29527  
282             } elsif ($otype eq 'hoh') {
283 2091         2318 my $row_name = $line_hash{ $args{'row.names'} };
284             # FIX: an undef/empty row-name cell used to become the '' hash
285             # key with "uninitialized" warnings; now it dies with the row
286             die sprintf "read_table: undefined row name (column '%s') in %s data row %d\n",
287 2091 100       2439 $args{'row.names'}, $file, $data_row
288             unless defined $row_name;
289             warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"
290 2090 100       3509 if $seen_rownames{$row_name}++;
291 2090         2036 foreach my $col (@uniq_header) {
292 29180 100       33709 next if $col eq $args{'row.names'};
293 27090         54994 $data{$row_name}{$col} = $line_hash{$col};
294             }
295             }
296 546   50     15670 });
      50        
297 540 100       6861 if ($otype eq 'aoh') {
298 524         2113 return \@data;
299             } else { # hoa or hoh
300 16         7933 return \%data;
301             }
302             }
303             # ---------------------------------------------------------------------------
304             # view(): an R-style `head` for the structures read_table() returns.
305             #
306             # Handles all three output.type values:
307             # aoh -> ARRAY of HASH refs
308             # hoa -> HASH of ARRAY refs
309             # hoh -> HASH of HASH refs
310             #
311             # Pure Perl; the only dependency is Scalar::Util (core). There is nothing for
312             # XS to speed up here: view only touches the first n rows, and the total-row
313             # count is O(1) for every structure (scalar @$aoh, array length, scalar keys).
314             # Keeping it pure Perl also keeps install portable (no compiler needed).
315             #
316             # Usage:
317             # view($data); # first 6 rows, like head()
318             # view($data, n => 20); # first 20 rows
319             # view($data, cols => [...]); # pin explicit column order
320             # view($data, na => '.', max_width => 30, gap => 1, ellipsis => '~');
321             # view($data, 'row.names' => 'id'); # use column 'id' as the row label
322             # view($data, to => \*STDERR); # print elsewhere
323             # my $s = view($data, return_only => 1); # capture string, suppress print
324             #
325             # Column order: read_table stores rows as hashes, so the original CSV column
326             # order is gone. view sorts columns by name for a stable layout and treats a
327             # column literally named 'row_name' (read_table's label for a leading blank
328             # header) as the row label. Pass cols => [...] to override the order.
329             # ---------------------------------------------------------------------------
330              
331             sub view {
332 41     41 1 164210 my $data = shift;
333 41         112 my %args = @_;
334              
335             # --- reject unknown arguments (mirrors read_table/write_table) ---
336 41         74 my %allowed = map { $_ => 1 } qw(
  492         651  
337             n rows na max_width ellipsis gap cols columns
338             to return_only row.names row_names
339             );
340 41         98 my @bad = sort grep { !$allowed{$_} } keys %args;
  60         114  
341 41 100       90 die "view: unknown argument(s): @bad\n" if @bad;
342              
343             # --- n / rows (synonyms); reject conflicting or non-integer values ---
344             die "view: pass either 'n' or 'rows', not both\n"
345 40 100 100     112 if exists $args{n} && exists $args{rows};
346             my $n = exists $args{rows} ? $args{rows}
347             : exists $args{n} ? $args{n}
348 39 100       89 : 6;
    100          
349 39 100 66     285 die "view: 'n'/'rows' must be a non-negative integer\n"
350             unless defined $n && $n =~ /^\d+$/;
351              
352 37 100       53 my $na = exists $args{na} ? $args{na} : 'NA';
353 37 100       62 my $maxw = exists $args{max_width} ? $args{max_width} : 80;
354 37 100       48 my $ell = exists $args{ellipsis} ? $args{ellipsis} : '...';
355 37 50       45 my $gap = exists $args{gap} ? (' ' x $args{gap}) : ' ';
356 37   100     80 my $ucols = $args{cols} || $args{columns};
357 37         42 my $fh = $args{to};
358 37         41 my $quiet = $args{return_only};
359              
360             # 'row.names' takes precedence over the row_names alias (both accepted)
361             my $label_col = exists $args{'row.names'} ? $args{'row.names'}
362             : exists $args{row_names} ? $args{row_names}
363 37 100       67 : undef;
    100          
364              
365 37         58 my $rt = ref $data;
366 37 100 100     85 die "view: expected an ARRAY (AoH) or HASH (HoA/HoH) reference, got "
      100        
367             . ($rt || 'a non-reference') . "\n"
368             unless $rt eq 'ARRAY' or $rt eq 'HASH';
369              
370 35         48 my ($kind, @cols, @labels, @raw, $total, $lab_header);
371 35 100       52 if ($rt eq 'ARRAY') { # ---- AoH ----
    50          
372 22         23 $kind = 'AoH';
373 22         25 $total = scalar @$data;
374 22 100       27 my $show = $n < $total ? $n : $total;
375              
376 22 100       42 if ($ucols) {
377 3         5 @cols = @$ucols;
378             } else {
379             # BUG FIX: scan at least one row when data exists, so the header
380             # still lists columns even when showing 0 rows (n => 0). PERF:
381             # collect unique keys once, then sort once -- not sort-per-row.
382 19 100       32 my $scan = $show > 0 ? $show : ($total > 0 ? 1 : 0);
    100          
383 19         22 my %seen;
384 19         37 for my $i (0 .. $scan - 1) {
385 29         28 my $row = $data->[$i];
386 29 100       47 next unless ref $row eq 'HASH';
387 28         74 $seen{$_} = 1 for keys %$row;
388             }
389 19         57 @cols = sort keys %seen;
390             }
391             my $lc = defined $label_col ? $label_col
392 22 100       36 : (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef;
  32 100       44  
393 22 100       53 if (defined $lc) {
394 3         3 @cols = grep { $_ ne $lc } @cols;
  6         10  
395 3         5 $lab_header = $lc;
396             }
397 22         30 for my $i (0 .. $show - 1) {
398 31         33 my $row = $data->[$i];
399 31 100       40 $row = {} unless ref $row eq 'HASH';
400 31 100       42 push @labels, defined $lc ? $row->{$lc} : ($i + 1);
401 31         34 push @raw, [ map { $row->{$_} } @cols ];
  50         83  
402             }
403 22 100       39 $lab_header = '' unless defined $lab_header;
404             } elsif ($rt eq 'HASH') {
405 13         23 my @keys = keys %$data;
406 13         12 my $sample;
407 13 100       19 for my $k (@keys) { $sample = $data->{$k}; last if defined $sample; }
  13         14  
  13         21  
408 13         15 my $vt = ref $sample;
409              
410 13 100       28 if (!@keys) { # ---- empty ----
    100          
    100          
411             # BUG FIX: an empty hash used to die ("neither ARRAY nor HASH");
412             # treat it as an empty table, mirroring an empty AoH.
413 1         2 $kind = 'Hash';
414 1         1 $total = 0;
415 1         1 $lab_header = '';
416             } elsif ($vt eq 'ARRAY') { # ---- HoA ----
417 5         5 $kind = 'HoA';
418 5 100       14 my @allcols = $ucols ? @$ucols : sort @keys;
419 5         6 $total = 0;
420 5         6 for my $k (@keys) {
421 10 100       14 next unless ref $data->{$k} eq 'ARRAY';
422 9         8 my $l = scalar @{ $data->{$k} };
  9         9  
423 9 100       16 $total = $l if $l > $total;
424             }
425 5 50       7 my $show = $n < $total ? $n : $total;
426             my $lc = defined $label_col ? $label_col
427 5 50       11 : (grep { $_ eq 'row_name' } @allcols) ? 'row_name' : undef;
  7 100       11  
428 5 100       7 @cols = grep { !defined $lc || $_ ne $lc } @allcols;
  9         41  
429 5 100       11 $lab_header = defined $lc ? $lc : '';
430 5         12 for my $i (0 .. $show - 1) {
431             push @labels, defined $lc
432 13 50       23 ? (ref $data->{$lc} eq 'ARRAY' ? $data->{$lc}[$i] : undef)
    100          
433             : ($i + 1);
434             push @raw, [ map {
435 13 100       18 ref $data->{$_} eq 'ARRAY' ? $data->{$_}[$i] : undef
  21         51  
436             } @cols ];
437             }
438             } elsif ($vt eq 'HASH') { # ---- HoH ----
439 6         6 $kind = 'HoH';
440 6         7 $total = scalar @keys;
441 6         18 my @rk = sort @keys;
442 6 100       9 my $show = $n < $total ? $n : $total;
443 6 50       32 my @shown = $show > 0 ? @rk[0 .. $show - 1] : ();
444 6 100       9 if ($ucols) {
445 1         1 @cols = @$ucols;
446             } else {
447             # PERF: gather unique inner keys, sort once.
448 5         6 my %seen;
449 5         6 for my $rkk (@shown) {
450 9 100       14 next unless ref $data->{$rkk} eq 'HASH';
451 8         8 $seen{$_} = 1 for keys %{ $data->{$rkk} };
  8         24  
452             }
453 5         12 @cols = sort keys %seen;
454             }
455 6 100       11 @cols = grep { $_ ne $label_col } @cols if defined $label_col;
  4         7  
456             # BUG FIX: honour the row_names alias here too (was 'row.names'
457             # only, so row_names => 'x' showed a 'row_name' header).
458 6 100       9 $lab_header = defined $label_col ? $label_col : 'row_name';
459 6         6 for my $rkk (@shown) {
460 11         12 push @labels, $rkk;
461 11 100       18 my $inner = ref $data->{$rkk} eq 'HASH' ? $data->{$rkk} : {};
462 11         12 push @raw, [ map { $inner->{$_} } @cols ];
  18         35  
463             }
464             } else { # ---- flat hash ----
465             # BUG/FEATURE: a hash whose values are plain scalars
466             # ({ a => 1, b => 2, ... }) used to die ("neither ARRAY nor
467             # HASH"). Render it like write_table's flat hash: one row, keys
468             # as columns, a numeric '1' row label.
469 1         2 $kind = 'Hash';
470 1         1 $total = 1;
471 1 50       3 my $show = $n < $total ? $n : $total;
472 1 50       3 if ($ucols) {
473 0         0 @cols = @$ucols;
474             } else {
475 1         4 @cols = sort @keys;
476             }
477             my $lc = defined $label_col ? $label_col
478 1 50       3 : (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef;
  3 50       5  
479 1 50       3 if (defined $lc) {
480 0         0 @cols = grep { $_ ne $lc } @cols;
  0         0  
481 0         0 $lab_header = $lc;
482             }
483 1 50       2 $lab_header = '' unless defined $lab_header;
484 1         3 for my $i (0 .. $show - 1) {
485 1 50       2 push @labels, defined $lc ? $data->{$lc} : ($i + 1);
486 1         2 push @raw, [ map { $data->{$_} } @cols ];
  3         5  
487             }
488             }
489             }
490              
491             # numeric detection per column (drives right/left alignment)
492 35         65 my @numeric = (1) x scalar @cols;
493 35         44 for my $r (@raw) {
494 56         79 for my $j (0 .. $#cols) {
495 92         90 my $v = $r->[$j];
496 92 100       126 next unless defined $v;
497 84 100       172 $numeric[$j] = 0 unless looks_like_number($v);
498             }
499             }
500 35 100       53 my $lab_numeric = @labels ? 1 : 0;
501 35         36 for my $l (@labels) {
502 48 100 66     134 $lab_numeric = 0, last unless defined $l && looks_like_number($l);
503             }
504              
505             # stringify + sanitize + truncate cells (column names left intact)
506 35         36 my $ell_len = length $ell;
507             my $clean = sub {
508 148     148   161 my $v = shift;
509 148 100       197 return $na unless defined $v;
510 140         136 $v = "$v";
511 140         162 $v =~ s/\t/\\t/g; $v =~ s/\r/\\r/g; $v =~ s/\n/\\n/g;
  140         149  
  140         147  
512 140 100 66     256 if ($maxw && length($v) > $maxw) {
513 2         2 my $keep = $maxw - $ell_len;
514 2 50       4 $keep = 0 if $keep < 0;
515 2         4 $v = substr($v, 0, $keep) . $ell;
516             }
517 140         236 return $v;
518 35         178 };
519 35         47 my @lab_s = map { $clean->($_) } @labels;
  56         96  
520 35         43 my @rows_s = map { [ map { $clean->($_) } @$_ ] } @raw;
  56         64  
  92         88  
521 35         54 my @head_s = @cols;
522              
523             # column widths
524 35         36 my $lab_w = length $lab_header;
525 35 100       44 for my $s (@lab_s) { my $l = length $s; $lab_w = $l if $l > $lab_w; }
  56         54  
  56         91  
526 35         35 my @w = map { length $_ } @head_s;
  54         66  
527 35         39 for my $r (@rows_s) {
528 56         66 for my $j (0 .. $#cols) {
529 92         103 my $l = length $r->[$j];
530 92 100       136 $w[$j] = $l if $l > $w[$j];
531             }
532             }
533              
534             my $pad = sub {
535 237     237   295 my ($s, $width, $right) = @_;
536 237 50       243 my $g = $width - length $s; $g = 0 if $g < 0;
  237         308  
537 237 100       502 return $right ? (' ' x $g) . $s : $s . (' ' x $g);
538 35         93 };
539              
540 35         37 my @out;
541 35         39 my $shown = scalar @rows_s;
542 35 100       167 push @out, sprintf("# %s: %d row%s x %d col%s (showing %d)",
    100          
543             $kind, $total, ($total == 1 ? '' : 's'),
544             scalar(@cols), (@cols == 1 ? '' : 's'), $shown);
545              
546 35         60 my @hcells = ( $pad->($lab_header, $lab_w, 0) );
547 35         83 push @hcells, $pad->($head_s[$_], $w[$_], $numeric[$_]) for 0 .. $#cols;
548 35         76 push @out, join($gap, @hcells);
549              
550 35         45 for my $ri (0 .. $#rows_s) {
551 56         66 my @cells = ( $pad->($lab_s[$ri], $lab_w, $lab_numeric) );
552 56         96 push @cells, $pad->($rows_s[$ri][$_], $w[$_], $numeric[$_]) for 0 .. $#cols;
553 56         133 push @out, join($gap, @cells);
554             }
555 35 50       59 push @out, sprintf("# ... %d more row%s", $total - $shown,
    100          
556             ($total - $shown == 1 ? '' : 's')) if $shown < $total;
557              
558 35         65 my $str = join("\n", @out) . "\n";
559 35 50       57 unless ($quiet) { defined $fh ? print {$fh} $str : print $str; }
  1 100       7  
  1         5  
560 35         461 return $str;
561             }
562              
563             1;
564             =encoding utf8
565              
566             =head1 Synopsis
567              
568             Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc.
569             I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.
570             There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.
571             This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.
572              
573             There B other modules on CPAN that can do B of this, but this works the way that I B it to.
574              
575             =head1 Functions/Subroutines
576              
577             ========================================================================
578              
579             =head2 add_data
580              
581             Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference.
582              
583             =head3 Hash of Hashes (HoH)
584              
585             When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows.
586              
587             $data = { 'Jack Smith' => { age => 30 } };
588            
589             $n = {
590             'Jack Smith' => { # Update existing (Hash)
591             dept => 'Engineering'
592             },
593             'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash)
594             'Invalid' => 'Not a reference' # Edge case safety
595             };
596            
597             add_data($data, $n);
598              
599             B
600              
601             {
602             "Jack Smith": {
603             "age": 30,
604             "dept": "Engineering"
605             },
606             "Jane Doe": {
607             "age": 25,
608             "dept": "Sales"
609             }
610             }
611              
612             =head3 Hash of Arrays (HoA)
613              
614             When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C.
615              
616             $data = { 'Project Alpha' => [ 'task1', 'task2' ] };
617             $n = {
618             'Project Alpha' => [ 'task3' ], # Appends to existing array
619             'Project Beta' => [ 'task1', 'task2' ] # Creates new array row
620             };
621             add_data($data, $n);
622              
623             B
624              
625             {
626             "Project Alpha": [ "task1", "task2", "task3" ],
627             "Project Beta": [ "task1", "task2" ]
628             }
629              
630             =head3 Array of Hashes / Arrays (AoH / AoA)
631              
632             C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices.
633              
634             $data = [
635             { id => 1, name => 'Alice' }
636             ];
637            
638             $n = [
639             { role => 'Admin' }, # Updates index 0
640             { id => 2, name => 'Bob' } # Creates index 1
641             ];
642            
643             add_data($data, $n);
644              
645             B
646              
647             [
648             { "id": 1, "name": "Alice", "role": "Admin" },
649             { "id": 2, "name": "Bob" }
650             ]
651              
652             =head3 Advanced Structural Coercion & Cross-Merging
653              
654             C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target.
655              
656             B<1. Inner Coercion (Mixing Rows):>
657              
658             =over
659              
660             =item * B Source Array rows are read in pairs and converted to key-value pairs.
661              
662             =item * B Source Hash rows are flattened into key-value pairs and pushed onto the array.
663              
664             =back
665              
666             B<2. Root-Level Coercion (Mixing Outer Containers):>
667              
668             =over
669              
670             =item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored.
671              
672             =item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">).
673              
674             =back
675              
676             =head3 Source is a mixed Hash. Keys dictate the target array index!
677              
678             $n = {
679             '0' => { y => 20 }, # Merges into $data->[0]
680             '1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1]
681             'ignored' => { k => 'v' } # Ignored: cannot map to an array index
682             };
683            
684             add_data($data, $n);
685              
686             B
687              
688             [
689             { "x": 10, "y": 20 },
690             { "z": 30 }
691             ]
692              
693             NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data.
694              
695             =head2 aoh2hoa
696              
697             C — transpose an B (row-major) into a B (column-major).
698              
699             my $hoa = aoh2hoa([ { a => 1, b => 2 }, { a => 3 } ]);
700             # $hoa = { a => [1, 3], b => [2, undef] }
701              
702             Rows go in, columns come out: each distinct key across the input rows becomes one output column, and the values are gathered down that column in row order.
703              
704             =head3 Arguments
705              
706             C<$aoh> — an array ref of hash refs, one hash per row. This is the only argument, and it is required. Passing anything that is not an array ref is fatal:
707              
708             aoh2hoa({ a => 1 }); # dies: argument must be an arrayref of hashrefs
709              
710             =head3 Returns
711              
712             A hash ref of array refs. Each key is a column name (the union of all keys seen across the rows); each value is an array ref holding that column's cells. Every column has exactly C elements, so the result is rectangular even when the input is ragged.
713              
714             =head3 Behavior
715              
716             The column set is the B of every row's keys — a key that appears in only some rows still produces a full-length column, with C in the rows that lacked it.
717              
718             Each column is padded to exactly the row count. Cells missing from a given row come through as C, including trailing gaps (a column whose last contributing row is early still runs the full length). These absent cells are cheap holes in the array, not stored SVs.
719              
720             Values are B (C), so the returned structure is independent of the input — mutating C<$aoh> afterward won't disturb the result. The copy is shallow: a value that is itself a reference is copied the same way C<< $col-E[$i] = $row-E{$k} >> would, i.e. the ref is duplicated but its referent is shared.
721              
722             Keys are handled SV-first (C / C), so UTF-8 and otherwise non-trivial hash keys round-trip correctly.
723              
724             A row that is B a hash ref is skipped rather than fatal: it contributes C to every column at its index. So a stray C or scalar in the input thins the columns at that position instead of dying.
725              
726             =head3 Notes
727              
728             The output column order follows hash iteration order and is therefore not guaranteed — sort the keys if you need a stable layout. Round-tripping through C (or the reverse) reconstructs the data but not necessarily the original key/row ordering, and rows originally absent a key will gain it as an explicit C.
729              
730             =head2 aov
731              
732             Warning: assumes normal distribution
733              
734             aov(
735             {
736             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
737             ctrl => [1, 1, 1, 0, 0, 0]
738             },
739             'yield ~ ctrl');
740              
741             which returns
742              
743             {
744             ctrl {
745             Df 1,
746             "F value" 25.6000000000001,
747             "Mean Sq" 1.70666666666667,
748             Pr(>F) 0.00718232855871859,
749             "Sum Sq" 1.70666666666667
750             },
751             Residuals {
752             Df 4,
753             "Mean Sq" 0.0666666666666665,
754             "Sum Sq" 0.266666666666666
755             }
756             }
757              
758             You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:
759              
760             my $res_2way = aov($data_2way, 'len ~ supp * dose');
761              
762             It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.
763              
764             =head3 Input Parameters
765              
766              
767              
768             =begin html
769              
770            
771            
772            
773             Parameter
774             Type
775             Default
776             Description
777             Example
778            
779            
780            
781            
782             data_sv
783             HashRef or ArrayRef
784             (Required)
785             The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's stack() on a named list).
786            
787            
788            
789             formula_sv
790             String
791             undef
792             A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to 'Value ~ Group' and the input data is stacked.
793             'yield ~ N * P'
794            
795            
796            
797              
798             =end html
799              
800              
801              
802             =head3 Output Variables
803              
804             The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs.
805              
806              
807              
808             =begin html
809              
810            
811            
812            
813             Parameter
814             Type
815             Default
816             Description
817             Example
818            
819            
820            
821            
822             (Term Name)
823             HashRef
824             undef
825             A nested hash for each independent term in the formula (e.g., 'Group', 'N:P'), containing its ANOVA table statistics.
826             {'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004}
827            
828            
829             Residuals
830             HashRef
831             undef
832             A nested hash containing the residual (error) statistics for the fitted model.
833             {'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55}
834            
835            
836             group_stats
837             HashRef
838             undef
839             A nested hash containing descriptive statistics (mean and size / count) for every column evaluated in the original unstacked data structure.
840             {'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}}
841            
842            
843            
844              
845             =end html
846              
847              
848              
849             =head3 omitting formula
850              
851             As of version 0.07, in the case of an omitted formula, stacking is done:
852              
853             aov(
854             {
855             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
856             ctrl => [1, 1, 1, 0, 0, 0]
857             },
858             );
859              
860             is the equivalent of:
861              
862             yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2)
863             ctrl <- c(1, 1, 1, 0, 0, 0)
864            
865             # Combine them into a named list (the R equivalent of your hash)
866             my_list <- list(yield = yield, ctrl = ctrl)
867            
868             # Convert the list into a "long" dataframe
869             # This creates two columns: "values" and "ind" (the group name)
870             my_data <- stack(my_list)
871            
872             # Rename columns for clarity (optional but good practice)
873             colnames(my_data) <- c("Value", "Group")
874             anova_model <- aov(Value ~ Group, data = my_data)
875             summary(anova_model)
876              
877             in R
878              
879             =head2 cfilter
880              
881             Select B out of a table and return it in the same shape. A column is
882             the inner (second-level) key of a B or an B,
883             or the outer key of a B:
884              
885             use Stats::LikeR;
886             my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] );
887             cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] }
888             cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] }
889              
890             C takes exactly one of C or C. C returns only the
891             matching columns; C returns everything except them. The result is the
892             same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values
893             copied and the original structure left untouched.
894              
895             =head3 Selecting by name
896              
897             Pass an array ref of column names. Naming a column that is not present in the
898             data is an error (it catches typos), and a row that happens not to contain a
899             kept column simply comes back without it:
900              
901             my @aoh = ( { a => 1, b => 2 }, { a => 3 } );
902             cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ]
903              
904             =head3 Selecting by a predicate
905              
906             Instead of names, C/C accept a B — a CODE ref or a
907             function name — evaluated once per column. It is called as
908              
909             $predicate->($column_values, $column_name)
910              
911             where C<$column_values> is an array ref of the column's B cells (undef
912             and missing cells are dropped, so functions like C get clean input).
913             With C, columns for which the predicate is true are kept; with C,
914             those columns are dropped.
915              
916             # Keep only the constant columns (standard deviation zero):
917             my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] }
918             # Drop the constant columns instead:
919             my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... }
920             # A bare function name resolves in Stats::LikeR:: (use a package for your own):
921             cfilter(\%hoa, keep => 'some_predicate');
922              
923             A bare string is always treated as a B, not a single column
924             name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>.
925              
926             =head3 Errors
927              
928             C dies (via C) when:
929              
930             =over
931              
932             =item * neither C nor C is given, or both are,
933              
934             =item * a named column is not present in the data,
935              
936             =item * the selector is neither an array ref nor a code ref / function name, or the
937             function name cannot be resolved,
938              
939             =item * an unknown option is given, or the options are not C<< name =E value >> pairs,
940              
941             =item * the data is not a hash/array reference of the expected shape (a hash of hash
942             refs or array refs, or an array of hash refs).
943              
944             =back
945              
946             =head2 chisq_test
947              
948             The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results.
949              
950             For 2x2 matrices, Yates' Continuity Correction is applied automatically.
951              
952             =head3 Accepted Inputs
953              
954              
955              
956             =begin html
957              
958            
959            
960            
961             Input Type
962             Data Structure
963             Applied Test
964            
965            
966            
967            
968             1D Array
969             [ $v1, $v2, ... ]
970             Chi-squared test for given probabilities
971            
972            
973             2D Array
974             [ [ $v1, $v2 ], [ $v3, $v4 ] ]
975             Pearson's Chi-squared test (Yates' correction if 2x2)
976            
977            
978             1D Hash
979             { key1 => $v1, key2 => $v2 }
980             Chi-squared test for given probabilities
981            
982            
983             2D Hash
984             { row1 => { c1 => $v1, c2 => $v2 } }
985             Pearson's Chi-squared test (Yates' correction if 2x2)
986            
987            
988            
989              
990             =end html
991              
992              
993              
994             =head3 Output Object Structure
995              
996             The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input.
997              
998              
999              
1000             =begin html
1001              
1002            
1003            
1004            
1005             Key
1006             Data Type
1007             Description
1008            
1009            
1010            
1011            
1012             data.name
1013             String
1014             Identifies the input type (e.g., "Perl ArrayRef" or "Perl HashRef").
1015            
1016            
1017             expected
1018             Array/Hash Ref
1019             The expected frequencies, matching the geometry of the input.
1020            
1021            
1022             method
1023             String
1024             The specific statistical test applied.
1025            
1026            
1027             observed
1028             Array/Hash Ref
1029             The original data passed to the function.
1030            
1031            
1032             p.value
1033             Float
1034             The calculated p-value of the test.
1035            
1036            
1037             parameter
1038             Hash Ref
1039             Contains the degrees of freedom (df).
1040            
1041            
1042             statistic
1043             Hash Ref
1044             Contains the test statistic (X-squared).
1045            
1046            
1047            
1048              
1049             =end html
1050              
1051              
1052              
1053             =head3 Two-Dimensional Array
1054              
1055             Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically.
1056              
1057             my $test_data = [
1058             [762, 327, 468],
1059             [484, 239, 477]
1060             ];
1061             my $res = chisq_test($test_data);
1062              
1063             B
1064              
1065             {
1066             'data.name' => 'Perl ArrayRef',
1067             'expected' => [
1068             [ 703.671381936888, 319.645266594124, 533.683351468988 ],
1069             [ 542.328618063112, 246.354733405876, 411.316648531012 ]
1070             ],
1071             'method' => "Pearson's Chi-squared test",
1072             'observed' => [
1073             [ 762, 327, 468 ],
1074             [ 484, 239, 477 ]
1075             ],
1076             'p.value' => 2.95358918321176e-07,
1077             'parameter' => { 'df' => 2 },
1078             'statistic' => { 'X-squared' => 30.0701490957547 }
1079             }
1080              
1081             =head3 1-Dimensional Array (Goodness of Fit)
1082              
1083             Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items.
1084              
1085             my $data = [10, 20, 30];
1086             my $res = chisq_test($data);
1087              
1088             B
1089              
1090             {
1091             'data.name' => 'Perl ArrayRef',
1092             'expected' => [ 20, 20, 20 ],
1093             'method' => 'Chi-squared test for given probabilities',
1094             'observed' => [ 10, 20, 30 ],
1095             'p.value' => 0.00673794699908547,
1096             'parameter' => { 'df' => 2 },
1097             'statistic' => { 'X-squared' => 10 }
1098             }
1099              
1100             =head3 2-Dimensional Hash (Pearson's Chi-squared)
1101              
1102             Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings.
1103              
1104             my $data = {
1105             GroupA => { Success => 10, Failure => 15 },
1106             GroupB => { Success => 20, Failure => 5 }
1107             };
1108            
1109             my $res = chisq_test($data);
1110              
1111             B
1112              
1113             {
1114             'data.name' => 'Perl HashRef',
1115             'expected' => {
1116             'GroupA' => { 'Failure' => 10, 'Success' => 15 },
1117             'GroupB' => { 'Failure' => 10, 'Success' => 15 }
1118             },
1119             'method' => "Pearson's Chi-squared test with Yates' continuity correction",
1120             'observed' => {
1121             'GroupA' => { 'Failure' => 15, 'Success' => 10 },
1122             'GroupB' => { 'Failure' => 5, 'Success' => 20 }
1123             },
1124             'p.value' => 0.00937475878430379,
1125             'parameter' => { 'df' => 1 },
1126             'statistic' => { 'X-squared' => 6.75 }
1127             }
1128              
1129             =head3 One-Dimensional Hash (Goodness of Fit)
1130              
1131             Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks.
1132              
1133             my $data = {
1134             Apples => 10,
1135             Oranges => 20,
1136             Bananas => 30
1137             };
1138            
1139             my $res = chisq_test($data);
1140              
1141             =head2 C
1142              
1143             Apply a B to every pair of columns in a table and collect
1144             the answers in a hash of hashes.
1145              
1146             It's the workhorse behind things like correlation matrices: give it your data and
1147             the name of a function that takes two columns (C, C, …) and you get
1148             back every column compared against every other column.
1149              
1150             use Stats::LikeR;
1151            
1152             my %data = (
1153             height => [ 170, 165, 180, 175 ],
1154             weight => [ 70, 60, 85, 77 ],
1155             age => [ 30, 41, 25, 38 ],
1156             );
1157            
1158             my $result = col2col(\%data, 'cor');
1159            
1160             # $result->{height}{weight} == correlation of height vs weight
1161             # $result->{height}{age} == correlation of height vs age
1162             # ...and so on for every pair
1163              
1164             ========================================================================
1165              
1166             =head3 Arguments
1167              
1168             col2col( $data, $command, $cols, %options )
1169             col2col( $data, $command, \%options ) # options in place of $cols
1170              
1171              
1172              
1173             =begin html
1174              
1175            
1176            
1177            
1178             Position
1179             Argument
1180             What it is
1181            
1182            
1183            
1184            
1185             1
1186             $data
1187             Your table, as a reference (see Data shapes below).
1188            
1189            
1190             2
1191             $command
1192             A code block or the name of a two-column function.
1193            
1194            
1195             3
1196             $cols
1197             (optional) Which columns to use as the "from" side. Omit for all.
1198            
1199            
1200             4+
1201             %options
1202             (optional) na, skip.errors, … (see Options).
1203            
1204            
1205            
1206              
1207             =end html
1208              
1209              
1210              
1211             ========================================================================
1212              
1213             =head3 Data shapes
1214              
1215             C understands three layouts. In every case a B is the thing that
1216             gets compared, and the result is keyed by column name.
1217              
1218             B — keys are column names:
1219              
1220             my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] );
1221              
1222             B — First keys are row names, second keys are columns:
1223              
1224             my %hoh = (
1225             row1 => { a => 1, b => 4 },
1226             row2 => { a => 2, b => 5 },
1227             );
1228              
1229             B — each element is a row, inner keys are columns:
1230              
1231             my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } );
1232              
1233             All three produce the same result for the same underlying numbers. Missing or
1234             C cells are handled by the C option (below).
1235              
1236             ========================================================================
1237              
1238             =head3 The command
1239              
1240             The second argument is the function applied to each pair of columns. It is called
1241             as:
1242              
1243             $command->( $column_a, $column_b ) # two ARRAY refs
1244              
1245             so inside a block the two columns arrive in C<@_>:
1246              
1247             my $result = col2col(\%data, sub {
1248             my ($x, $y) = @_; # $x and $y are array refs
1249             cor($x, $y);
1250             });
1251              
1252             You can also pass a B. A bare name is looked up in
1253             C, so these two are equivalent:
1254              
1255             col2col(\%data, 'cor');
1256             col2col(\%data, sub { cor($_[0], $_[1]) });
1257              
1258             ========================================================================
1259              
1260             =head3 The result
1261              
1262             Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>.
1263              
1264             for my $from (sort keys %$result) {
1265             for my $to (sort keys %{ $result->{$from} }) {
1266             printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to};
1267             }
1268             }
1269              
1270             A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist.
1271              
1272             ========================================================================
1273              
1274             =head3 Restricting columns (C<$cols>)
1275              
1276             By default every column is used as the "from" side. The third argument narrows
1277             that down — handy when you only care about one variable.
1278              
1279             # all columns vs all columns
1280             my $all = col2col(\%data, 'cor');
1281             # just ONE column vs every other column
1282             my $one = col2col(\%data, 'cor', 'height');
1283             my $cors = $one->{height}; # { weight => ..., age => ... }
1284             # a FEW specific columns vs every other column
1285             my $few = col2col(\%data, 'cor', ['height', 'weight']);
1286              
1287             The "to" side is always every other column; C<$cols> only limits the outer keys.
1288              
1289             ========================================================================
1290              
1291             =head3 Options
1292              
1293             Options can be given two ways:
1294              
1295             col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols
1296             col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed
1297              
1298             The hash-ref form is convenient when you have B column restriction — it saves
1299             you from passing a placeholder. (A hash ref I C<$cols>, so you can't use
1300             it to restrict columns at the same time; use the trailing form for that.)
1301              
1302             =head4 C — how undefined values are handled
1303              
1304             Real data has gaps. C decides what the function sees.
1305              
1306              
1307              
1308             =begin html
1309              
1310            
1311            
1312            
1313             Value
1314             Behaviour
1315             Use for
1316            
1317            
1318            
1319            
1320             'pairwise' (default)
1321             A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length.
1322             Paired stats like cor.
1323            
1324            
1325             'omit'
1326             Each column drops its own undefined values independently. The two columns may end up different lengths.
1327             Unpaired tests like t_test, kruskal_test, where a gap in one sample shouldn't discard a value in the other.
1328            
1329            
1330             'keep'
1331             Every row is passed through, undef and all.
1332             When your function does its own missing-data handling.
1333            
1334            
1335            
1336              
1337             =end html
1338              
1339              
1340              
1341             # correlation: keep only complete pairs (the default)
1342             col2col(\%data, 'cor');
1343             # two-sample test: each column keeps its own values
1344             col2col(\%data, 't_test', undef, na => 'omit');
1345             col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder
1346              
1347             C / C remain as boolean aliases for backward compatibility:
1348             C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C.
1349              
1350             =head4 C — keep going when a pair fails I<(default: true)>
1351              
1352             Some functions croak on degenerate input — for example C dies if a column has
1353             zero variance. By default C B that croak per pair: instead of
1354             aborting the whole run, it stores the B of the error message in that
1355             cell, so the result tells you I pair failed and I. Every other cell is
1356             computed normally.
1357              
1358             my $r = col2col(\%data, 'cor');
1359             # a good pair: $r->{a}{b} == 0.83
1360             # a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0'
1361              
1362             To restore the old "die on the first error" behaviour, turn it off:
1363              
1364             col2col(\%data, 'cor', undef, 'skip.errors' => 0);
1365             col2col(\%data, 'cor', { 'skip.errors' => 0 });
1366              
1367             Only errors from B are trapped. Mistakes in the call itself
1368             (unknown column, bad data, unknown function name, unknown option) always die.
1369              
1370             ========================================================================
1371              
1372             =head3 Worked examples
1373              
1374             B
1375              
1376             my $m = col2col(\%data, 'cor');
1377              
1378             B
1379              
1380             my $col = 'Testosterone, total (nmol/L)';
1381             my $cors = col2col($hoa, 'cor', $col)->{$col};
1382             for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) {
1383             next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message
1384             printf "%-30s % .3f\n", $other, $cors->{$other};
1385             }
1386              
1387             B
1388              
1389             my $t = col2col($hoa, 't_test', undef, na => 'omit');
1390              
1391             B
1392              
1393             my $m = col2col($hoa, 'cor');
1394             for my $from (sort keys %$m) {
1395             for my $to (sort keys %{ $m->{$from} }) {
1396             my $v = $m->{$from}{$to};
1397             warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error
1398             }
1399             }
1400              
1401             ========================================================================
1402              
1403             =head3 Gotchas
1404              
1405             =over
1406              
1407             =item * B, C<($col_a, $col_b)> — not a column and
1408             a name. Unpack with C.
1409              
1410             =item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall
1411             variance can be flat on just the rows it shares with one partner, so C may
1412             still croak for that pair. With the default C, that shows up as a
1413             message in the single offending cell rather than killing the run.
1414              
1415             =item * B<< C does not modify your data. >> It reads the table and returns a new
1416             hash of hashes.
1417              
1418             =item * B — i.e.
1419             C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0>
1420             means column C is the degenerate one for that pair.
1421              
1422             =back
1423              
1424             =head2 cor
1425              
1426             cor($array1, $array2, $method = 'pearson'),
1427              
1428             that is, C is the default and will be used if C<$method> is not specified.
1429              
1430             Just like R, C, C, and C are available
1431              
1432             If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically.
1433              
1434             =head2 cor_test
1435              
1436             my $result = cor_test(
1437             'x' => $x,
1438             'y' => $y,
1439             alternative => 'two.sided',
1440             method => 'pearson',
1441             continuity => 1
1442             );
1443              
1444             C safely handles C (or C) values seamlessly by computing over pairwise complete observations.
1445              
1446             =head2 cov
1447              
1448             cov($array1, $array2, 'pearson')
1449              
1450             or
1451              
1452             cov($array1, $array2, 'spearman')
1453              
1454             or
1455              
1456             cov($array1, $array2, 'kendall')
1457              
1458             =head2 csort
1459              
1460             Sort a table by a column or by a custom comparator. Works on both common Perl table shapes and can transpose between them on the way out. Stable, non-destructive.
1461              
1462             =head3 Signature
1463              
1464             my $sorted = csort($data, $by);
1465             my $sorted = csort($data, $by, $output);
1466              
1467             =over
1468              
1469             =item * B<< C<$data> >> — your table, in either shape:
1470             =over
1471              
1472             =item * B — arrayref of hashrefs (a list of rows): C<< [ {id=E1, v=E10}, {id=E2, v=E20} ] >>
1473              
1474             =item * B — hashref of arrayrefs (parallel columns): C<< { id=E[1,2], v=E[10,20] } >>
1475              
1476             =back
1477              
1478             =item * B<< C<$by> >> — I to sort:
1479             =over
1480              
1481             =item * a B (string), or
1482              
1483             =item * a B coderef using C<$a> / C<$b>, just like Perl's C
1484              
1485             =back
1486              
1487             =item * B<< C<$output> >> I<(optional)> — C<'aoh'> or C<'hoa'> (case-insensitive). Defaults to the input shape; C also means "same as input".
1488              
1489             =back
1490              
1491             Returns a B structure. The input is never modified.
1492              
1493             =head3 What it does
1494              
1495             =over
1496              
1497             =item * B — numeric if every defined value in that column looks like a number, otherwise string comparison. Missing / C values sort B (matching R's C).
1498              
1499             =item * B — C<$a> and C<$b> are set in the comparator's I package, so a named sub from another package still sees its own C<$a>/C<$b>. For AoH they're the row hashrefs; for HoA they're per-row hashref views synthesized from the columns.
1500              
1501             =item * B — equal keys keep their original order (merge sort, same as Perl C and R C).
1502              
1503             =item * B — keep the input shape, or transpose: AoH→HoA builds the union of all row keys (ordered by first appearance, gaps filled with C); HoA→AoH emits one hashref per row.
1504              
1505             =back
1506              
1507             =head3 Examples
1508              
1509             # by column, ascending numeric, AoH in / AoH out
1510             my $rows = csort($aoh, 'score');
1511            
1512             # custom comparator (descending), HoA in / HoA out
1513             my $cols = csort($hoa, sub { $b->{score} <=> $a->{score} });
1514            
1515             # sort an AoH but hand it back as columns (HoA)
1516             my $cols = csort($aoh, 'name', 'hoa');
1517              
1518             =head3 Notes
1519              
1520             =over
1521              
1522             =item * B AoH output reuses the original row hashrefs (re-ordered); HoA output permutes every column in lockstep.
1523              
1524             =item * Empty and single-row tables are handled for all four in/out combinations.
1525              
1526             =item * An invalid C<$output> value croaks.
1527              
1528             =back
1529              
1530             =head2 dnorm
1531              
1532             gives the density of the normal distribution, with the specified mean and standard deviation.
1533              
1534             In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value.
1535              
1536             returns a single scalar/number if a single value is given, otherwise returns an array reference.
1537              
1538             Usage:
1539              
1540             dnorm(4) # assumes a mean of 0 and standard deviation of 1
1541              
1542             but default mean, standard deviation, and log can be passed as parameters:
1543              
1544             $x = dnorm(0, mean => 0, sd => 2, 'log' => 0);
1545              
1546             =head2 filter
1547              
1548             Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified.
1549              
1550             my $df2 = filter($df, col('column.name') > 4);
1551              
1552             C accepts a predicate in one of two forms:
1553              
1554             =over
1555              
1556             =item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and
1557              
1558             =item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>.
1559              
1560             =back
1561              
1562             Both C and C
1563              
1564             =head3 Arguments
1565              
1566              
1567              
1568             =begin html
1569              
1570            
1571            
1572            
1573             Position
1574             Name
1575             Description
1576            
1577            
1578            
1579            
1580             1
1581             $df
1582             The data frame to filter. Either an array of hashes (AoH — e.g. the default output of read_table) or a hash of arrays (HoA).
1583            
1584            
1585             2
1586             predicate
1587             Either a col() comparison object or a CODE reference.
1588            
1589            
1590            
1591              
1592             =end html
1593              
1594              
1595              
1596             The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned.
1597              
1598             =head3 The C form
1599              
1600             C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row.
1601              
1602             filter($df, col('age') >= 18); # keep rows where age >= 18
1603             filter($df, col('sex') eq 'f'); # keep rows where sex is 'f'
1604             filter($df, 18 <= col('age')); # operands may be in either order
1605              
1606             =head3 Comparison operators
1607              
1608              
1609              
1610             =begin html
1611              
1612            
1613            
1614            
1615             Kind
1616             Operators
1617             Comparison
1618            
1619            
1620            
1621            
1622             Numeric
1623             > < >= <= == !=
1624             numeric (the cell and the value are compared as numbers)
1625            
1626            
1627             String
1628             gt lt ge le eq ne
1629             string (the cell and the value are compared as strings)
1630            
1631            
1632            
1633              
1634             =end html
1635              
1636              
1637              
1638             C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>.
1639              
1640             =head3 Combining predicates: C<&>, C<|>, C
1641              
1642             Predicates compose with bitwise C<&> (and), C<|> (or), and C (not):
1643              
1644             filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and
1645             filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or
1646             filter($df, !(col('x') > 100)); # not
1647              
1648             Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability.
1649              
1650             =head3 The code-reference form
1651              
1652             For logic the operators can't express, pass a C. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row.
1653              
1654             filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' });
1655             filter($df, sub { $_->{name} =~ /^A/ });
1656             filter($df, sub { $_[0]{score} > $_[0]{threshold} });
1657              
1658             For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape.
1659              
1660             =head3 Examples
1661              
1662             use Stats::LikeR;
1663             my $df = read_table('patients.csv'); # array of hashes
1664             # numeric threshold
1665             my $adults = filter($df, col('Age') >= 18);
1666             # combine conditions
1667             my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f'));
1668             # arbitrary logic with a coderef
1669             my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 });
1670             # hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel
1671             my $hoa = read_table('patients.csv', 'output.type' => 'hoa');
1672             my $sub = filter($hoa, col('Age') > 32);
1673             # $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned
1674              
1675             =head3 Behavior and notes
1676              
1677             =over
1678              
1679             =item * B C builds and returns a new frame; C<$df> is left untouched.
1680              
1681             =item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C.
1682              
1683             =item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays.
1684              
1685             =item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA).
1686              
1687             =item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error.
1688              
1689             =item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS.
1690              
1691             =back
1692              
1693             =head3 See also
1694              
1695             C (whose C option applies the same coderef convention while reading a file), C.
1696              
1697             =head2 fisher_test
1698              
1699             =head3 array reference entry
1700              
1701             my $array_data = [
1702             [10, 2],
1703             [3, 15]
1704             ];
1705             my $res1 = fisher_test($array_data);
1706              
1707             which returns a hash reference:
1708              
1709             {
1710             alternative "two.sided",
1711             conf_int [
1712             [0] 2.75343836564204,
1713             [1] 300.682787419401
1714             ],
1715             conf_level 0.95,
1716             estimate {
1717             "odds ratio" 21.3053312750168
1718             },
1719             method "Fisher's Exact Test for Count Data",
1720             p_value 0.000536724119143435
1721             }
1722              
1723             =head3 hash reference entry
1724              
1725             $ft = fisher_test( {
1726             Guess => {
1727             Milk => 3, Tea => 1
1728             },
1729             Truth => {
1730             Milk => 1, Tea => 3
1731             }
1732             });
1733              
1734             =head2 glm
1735              
1736             takes a hash of an array as input
1737              
1738             my %tooth_growth = (
1739             dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
1740             1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
1741             0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
1742             2.0 2.0 2.0)],
1743             len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5
1744             17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
1745             15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0
1746             25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],
1747             supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
1748             VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
1749             OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]
1750             );
1751            
1752             my $glm_teeth = glm(
1753             data => \%tooth_growth,
1754             formula => 'len ~ dose + supp',
1755             family => 'gaussian'
1756             );
1757              
1758             In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS):
1759              
1760             my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');
1761              
1762             =head3 Input Parameters
1763              
1764              
1765              
1766             =begin html
1767              
1768            
1769            
1770            
1771             Parameter
1772             Type
1773             Default
1774             Description
1775             Example
1776            
1777            
1778            
1779            
1780             formula
1781             String
1782             None (Required)
1783             A symbolic description of the model to be fitted. Supports operators like +, :, *, ^, and -1 (to remove the intercept).
1784             'am ~ wt + hp', 'y ~ x - 1'
1785            
1786            
1787             data
1788             HashRef or ArrayRef
1789             None (Required)
1790             The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH).
1791             \%mtcars, [{x => 1, y => 2}, ...]
1792            
1793            
1794             family
1795             String
1796             'gaussian'
1797             A description of the error distribution and link function to be used in the model. Currently supports 'gaussian' (identity link) and 'binomial' (logit link).
1798             'binomial'
1799            
1800            
1801            
1802              
1803             =end html
1804              
1805              
1806              
1807             =head3 Output variables
1808              
1809              
1810              
1811             =begin html
1812              
1813            
1814            
1815            
1816             Variable
1817             Type
1818             Description
1819             Example
1820            
1821            
1822            
1823            
1824             aic
1825             Double
1826             Akaike's Information Criterion for the fitted model.
1827             123.45
1828            
1829            
1830             boundary
1831             Integer (Boolean)
1832             1 if the fitted values computationally reached the 0 or 1 boundary (specific to the binomial family), 0 otherwise.
1833             0
1834            
1835            
1836             coefficients
1837             HashRef
1838             A hash mapping the expanded model term names to their estimated coefficient values.
1839             {'Intercept' => 1.5, 'wt' => -0.5}
1840            
1841            
1842             converged
1843             Integer (Boolean)
1844             1 if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, 0 otherwise.
1845             1
1846            
1847            
1848             deviance
1849             Double
1850             The residual deviance of the fitted model.
1851             15.2
1852            
1853            
1854             deviance.resid
1855             HashRef
1856             A hash mapping data row names to their computed deviance residuals.
1857             {'Mazda RX4' => 0.12}
1858            
1859            
1860             df.null
1861             Integer
1862             The residual degrees of freedom for the null model.
1863             31
1864            
1865            
1866             df.residual
1867             Integer
1868             The residual degrees of freedom for the fitted model.
1869             30
1870            
1871            
1872             family
1873             String
1874             The statistical family used to fit the model.
1875             "gaussian"
1876            
1877            
1878             fitted.values
1879             HashRef
1880             A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response).
1881             {'Mazda RX4' => 0.85}
1882            
1883            
1884             iter
1885             Integer
1886             The number of IRLS iterations performed before convergence or hitting the iteration limit.
1887             4
1888            
1889            
1890             null.deviance
1891             Double
1892             The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed).
1893             43.5
1894            
1895            
1896             rank
1897             Integer
1898             The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters).
1899             2
1900            
1901            
1902             summary
1903             HashRef
1904             A nested hash mapping each term to its detailed summary statistics, including Estimate, Std. Error, t value / z value, and Pr(> t ) / Pr(> z ). Aliased parameters return "NaN".
1905             {'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}}
1906            
1907            
1908             terms
1909             ArrayRef
1910             An ordered list of the expanded term names included in the model matrix.
1911             ['Intercept', 'wt', 'hp']
1912            
1913            
1914            
1915              
1916             =end html
1917              
1918              
1919              
1920             =head2 group_by
1921              
1922             Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column.
1923              
1924             my $aoh_data = [
1925             { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },
1926             { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },
1927             { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },
1928             { 'Gender' => 'Female' } # Intentional missing target value
1929             ];
1930              
1931             as well as
1932              
1933             $hoh_data = {
1934             'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },
1935             'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },
1936             'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },
1937             'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value
1938             'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef
1939             };
1940              
1941             and
1942              
1943             my $hoa_data = {
1944             'Gender' => ['Male', 'Female', 'Male', 'Female'],
1945             'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ]
1946             };
1947              
1948             then run the function thus:
1949              
1950             group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender');
1951              
1952             The output can be thought of like a hash, with the first string broken down by the second.
1953              
1954             all become hash of arrays:
1955              
1956             {
1957             Female [
1958             [0] 1.8
1959             ],
1960             Male [
1961             [0] 18.2,
1962             [1] 20.5
1963             ]
1964             }
1965              
1966             returns an empty array of hashes if neither target nor group keys are found.
1967              
1968             =head3 Filtering
1969              
1970             Data can be further broken down with filter/subs like in C:
1971              
1972             my $testosterone = group_by($d, # group testosterone by "Gender"
1973             'Testosterone, total (nmol/L)',
1974             'Gender',
1975             { 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter
1976             { 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter
1977             );
1978              
1979             where each filter filters on the columns, e.g. second hash keys.
1980              
1981             =head2 hoh2hoa
1982              
1983             Convert a B (row-major: outer key = row, inner key = column)
1984             into a B (column-major: key = column, value = that column's
1985             cells down the rows).
1986              
1987             use Stats::LikeR;
1988            
1989             my %hoh = (
1990             'r1' => { 'a' => 1, 'b' => 2 },
1991             'r2' => { 'a' => 3, 'b' => 4 },
1992             );
1993            
1994             my $hoa = hoh2hoa(\%hoh);
1995              
1996             which returns
1997             {
1998             a => [1, 3],
1999             b => [2, 4],
2000             }
2001              
2002             =head3 Behavior
2003              
2004             =over
2005              
2006             =item * B are the union of every inner key, so a key that appears in only
2007             some rows still becomes a column.
2008              
2009             =item * B are emitted in sorted outer-key (row-name) order, and that one order
2010             is used for every column, so the arrays stay aligned and the result is
2011             reproducible regardless of hash ordering.
2012              
2013             =item * B — a missing inner key, or a cell whose value is C — are filled
2014             with the fill value (see C below). Every column therefore has
2015             exactly one entry per row.
2016              
2017             =item * Values are B into the result; the original structure is left
2018             untouched.
2019              
2020             =item * An B hash of hashes returns an empty hash of arrays (it is not an
2021             error).
2022              
2023             =back
2024              
2025             =head3 Options
2026              
2027             Options are passed as trailing C<< name =E value >> pairs.
2028              
2029              
2030              
2031             =begin html
2032              
2033            
2034            
2035            
2036             Option
2037             Default
2038             Meaning
2039            
2040            
2041            
2042            
2043             undef.val
2044             undef
2045             Value used to fill a missing key or an undef cell. Any defined scalar works, including 0 and ''. Passing undef keeps the default.
2046            
2047            
2048             row.names
2049             (none)
2050             If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column.
2051            
2052            
2053            
2054              
2055             =end html
2056              
2057              
2058              
2059             # Ragged input with an explicit fill string:
2060             my %ragged = (
2061             'r1' => { 'a' => 1, 'b' => 2 },
2062             'r2' => { 'a' => 3, 'c' => 9 },
2063             );
2064             my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA');
2065             # {
2066             # a => [1, 3 ],
2067             # b => [2, 'NA'],
2068             # c => ['NA', 9 ],
2069             # }
2070            
2071             # Keep the row labels as a column:
2072             my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id');
2073             # {
2074             # id => ['r1', 'r2'],
2075             # a => [1, 3 ],
2076             # b => [2, undef],
2077             # c => [undef, 9 ],
2078             # }
2079              
2080             =head3 Errors
2081              
2082             C dies (via C) when:
2083              
2084             =over
2085              
2086             =item * the argument is not a hash reference,
2087              
2088             =item * any value in the hash is not itself a hash reference,
2089              
2090             =item * an unknown option is given, or the options are not C<< name =E value >> pairs,
2091              
2092             =item * C is not a plain string, or it names an already-present column.
2093              
2094             =back
2095              
2096             =head2 hist
2097              
2098             Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.
2099              
2100             my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);
2101              
2102             If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.
2103              
2104             =head2 kruskal_test
2105              
2106             Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)
2107              
2108             Performs a Kruskal-Wallis rank sum test, see
2109             https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test
2110              
2111             =head3 hash of array entry
2112              
2113             I feel that this is better, and more easily read, than what you get in R:
2114              
2115             my %x = (
2116             'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],
2117             'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],
2118             'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]
2119             );
2120             $kt = kruskal_test(\%x);
2121              
2122             =head3 R-like array entry
2123              
2124             my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects
2125             my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease
2126             my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis
2127             my @x = (@xk, @yk, @zk);
2128             my @g = (
2129             (map {'Normal subjects'} 0..4),
2130             (map {'Subjects with obstructive airway disease'} 0..3),
2131             map {'Subjects with asbestosis'} 0..4
2132             );
2133             my $kt = kruskal_test(\@x, \@g);
2134              
2135             =head2 ks_test
2136              
2137             The Kolmogorov–Smirnov test checks whether two samples are drawn from the
2138             same distribution (two-sample), or whether a single sample is drawn from a
2139             given reference distribution (one-sample). It works by comparing the empirical
2140             cumulative distribution functions (ECDFs) and measuring the largest gap
2141             between them.
2142              
2143             Two-sample form — pass two array references:
2144              
2145             $ks = ks_test(\@x, \@y);
2146             $ks = ks_test(\@x, \@y, alternative => 'greater');
2147              
2148             One-sample form — pass one array reference and the name of a reference CDF.
2149             Currently only C<'pnorm'> is supported, i.e. the standard normal distribution
2150             (mean 0, standard deviation 1):
2151              
2152             $ks = ks_test(\@x, 'pnorm');
2153              
2154             Arguments may be given positionally (as above) or by name:
2155              
2156             $ks = ks_test(x => \@x, y => \@y, alternative => 'less', exact => 1);
2157              
2158             Non-numeric and undefined elements are silently dropped before the test runs.
2159              
2160             C selects which gap between the ECDFs is measured:
2161              
2162             =over
2163              
2164             =item * C<'two.sided'> (default) — the largest gap in either direction,
2165             D = sup |F_x − F_y|.
2166              
2167             =item * C<'greater'> — the largest gap where x's ECDF rises above the other,
2168             D⁺ = sup (F_x − F_y).
2169              
2170             =item * C<'less'> — the largest gap in the other direction, D⁻ = sup (F_y − F_x).
2171              
2172             =back
2173              
2174             These follow R's C convention: C<'greater'>/C<'less'> describe which CDF
2175             lies I the other, which (because a higher CDF means smaller values) is
2176             the opposite of which sample tends to be larger.
2177              
2178             C controls how the p-value is computed. Omit it to let the test choose:
2179             the exact distribution is used for small samples (two-sample when nx·ny
2180             10000, one-sample when n < 100) and the asymptotic (Kolmogorov limiting)
2181             approximation otherwise. Pass C<< exact =E 1 >> to force the exact computation or
2182             C<< exact =E 0 >> to force the asymptotic one. Exact p-values cannot be computed
2183             when the data contain ties; if ties are present on the exact path, the test
2184             warns and falls back to the asymptotic p-value. (The exact one-sample test is
2185             only available for the two-sided alternative; a one-sided one-sample request
2186             also falls back to asymptotic.)
2187              
2188             =head3 Return value
2189              
2190             C returns a hash reference with four keys:
2191              
2192             =over
2193              
2194             =item * B<< C >> — the KS statistic for the chosen C: D, D⁺, or
2195             D⁻. It is the maximum distance between the two ECDFs (or, for the one-sample
2196             test, between the ECDF and the reference CDF), always in the range [0, 1].
2197             Larger values mean the distributions are further apart.
2198              
2199             =item * B<< C >> — the probability, under the null hypothesis that the samples
2200             share a distribution, of observing a statistic at least this large. It is
2201             clamped to [0, 1]; a small value (e.g. < 0.05) is evidence against the null.
2202              
2203             =item * B<< C >> — a human-readable description of exactly what was run, handy
2204             for logging or reproducing a result. One of:
2205             C<"Two-sample Kolmogorov-Smirnov exact test">,
2206             C<"Two-sample Kolmogorov-Smirnov test (asymptotic)">,
2207             C<"One-sample Kolmogorov-Smirnov exact test">, or
2208             C<"One-sample Kolmogorov-Smirnov test (asymptotic)">.
2209              
2210             =item * B<< C >> — the alternative hypothesis that was applied
2211             (C<'two.sided'>, C<'greater'>, or C<'less'>), echoed back so the result is
2212             self-describing.
2213              
2214             =back
2215              
2216             For example:
2217              
2218             my $ks = ks_test(\@x, \@y);
2219             if ($ks->{p_value} < 0.05) {
2220             printf "reject H0: D=%.4f, p=%.4g (%s)\n",
2221             $ks->{statistic}, $ks->{p_value}, $ks->{method};
2222             }
2223              
2224             =head2 ljoin
2225              
2226             Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>.
2227             C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>
2228              
2229             For example,
2230              
2231             {
2232             "Jack Smith" {
2233             age 30
2234             }
2235             }
2236              
2237             and a second hash,
2238             {
2239             "Jack Smith" {
2240             dept "Engineering"
2241             },
2242             "Jane Doe" {
2243             age 25
2244             }
2245             }
2246              
2247             in this case, running C will modify \%h to result:
2248              
2249             {
2250             "Jack Smith" {
2251             age 30,
2252             dept "Engineering"
2253             }
2254             }
2255              
2256             =head2 lm
2257              
2258             This is the linear models function.
2259              
2260             $lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);
2261              
2262             where C<$mtcars> is a hash of hashes
2263              
2264             C also supports generating interaction terms directly within the formula using the C<*> operator:
2265              
2266             my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);
2267              
2268             If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting.
2269              
2270             the dot operator also works:
2271              
2272             $lm = lm(formula => 'y ~ .', data => $dot_data);
2273              
2274             =head2 matrix
2275              
2276             my $mat1 = matrix(
2277             data => [1..6],
2278             nrow => 2
2279             );
2280              
2281             You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise.
2282              
2283             As of version 0.10, parameters do not need to be named, so that C works more like R:
2284              
2285             my $d = matrix(rnorm(32000), 1000, 32);
2286              
2287             works as C, C, and C
2288              
2289             =head2 max
2290              
2291             max(1,2,3);
2292              
2293             or
2294              
2295             my @arr = 1..8;
2296             max(@arr, 4, 5)
2297              
2298             as of version 0.02, max will die if any undefined values are provided
2299              
2300             =head2 mean
2301              
2302             mean(1,2,3);
2303              
2304             or
2305              
2306             my @arr = 1..8;
2307             mean(@arr, 4, 5)
2308              
2309             or
2310              
2311             mean([1,1], [2,2]) # 1.5
2312              
2313             as of version 0.02, mean will die if any undefined values are provided
2314              
2315             =head2 median
2316              
2317             works like mean, taking array references and arrays:
2318              
2319             median( $test_data[$i][0] )
2320              
2321             as of version 0.02, median will die if any undefined values are provided
2322              
2323             =head2 min
2324              
2325             min(1,2,3);
2326              
2327             or
2328              
2329             my @arr = 1..8;
2330             min(@arr, 4, 5)
2331              
2332             as of version 0.02, min will die if any undefined values are provided
2333              
2334             =head2 mode
2335              
2336             Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings)
2337              
2338             @arr = mode([1,3,3,3]); # returns (3)
2339            
2340             @arr = mode('a','a','c','c','z'); # returns ('a', 'c')
2341              
2342             =head2 oneway_test
2343              
2344             A one-way test for equality of group means that, unlike C/ANOVA, B
2345             assume equal variances>. By default it performs B (the
2346             same default as R's C), so the residual degrees of freedom are
2347             usually fractional. Pass C<< var_equal =E 1 >> for the classic equal-variance form.
2348              
2349             use Stats::LikeR qw(oneway_test);
2350              
2351             =head3 Input
2352              
2353             C accepts your data in one of three shapes. In every case each
2354             I is a vector of at least two numeric observations.
2355              
2356              
2357              
2358             =begin html
2359              
2360            
2361            
2362            
2363             Shape
2364             What it means
2365             Group labels
2366            
2367            
2368            
2369            
2370             Hash of arrays { a => [...], b => [...] }
2371             Each key is a group (R's stack() view of a named list)
2372             the hash keys
2373            
2374            
2375             Array of arrays [ [...], [...] ]
2376             Each element is a group
2377             "Index 0", "Index 1", …
2378            
2379            
2380             Hash + formula { resp => [...], grp => [...] }, formula => 'resp ~ grp'
2381             Long-format columns split by a factor column
2382             the distinct values of the factor
2383            
2384            
2385            
2386              
2387             =end html
2388              
2389              
2390              
2391             =head3 Options
2392              
2393              
2394              
2395             =begin html
2396              
2397            
2398            
2399            
2400             Option
2401             Default
2402             Meaning
2403            
2404            
2405            
2406            
2407             var_equal (alias var.equal)
2408             0 (false)
2409             0 → Welch's test (unequal variances). 1 → pooled-variance test.
2410            
2411            
2412             formula
2413             none
2414             'response ~ factor'. Only valid with a hash input; an error with an array of arrays.
2415            
2416            
2417            
2418              
2419             =end html
2420              
2421              
2422              
2423             =head3 Data validation
2424              
2425             Every observation must be B; an C or non-numeric
2426             cell makes the call C with the offending group and position. This matches
2427             the rest of C (C, C, C, … all die on C) and
2428             prevents missing values from being silently treated as C<0>.
2429              
2430             Each group needs at least two observations, and you need at least two groups.
2431              
2432             =head3 Output
2433              
2434             A hash reference with three top-level keys:
2435              
2436              
2437              
2438             =begin html
2439              
2440            
2441            
2442            
2443             Key
2444             Value
2445            
2446            
2447            
2448            
2449             factor name (Group, or the formula's factor, e.g. supp)
2450             the between-groups row: Df, Sum Sq, Mean Sq, F value, Pr(>F)
2451            
2452            
2453             Residuals
2454             the within-groups row: Df, Sum Sq, Mean Sq (Df is fractional under Welch)
2455            
2456            
2457             group_stats
2458             { mean => { group => mean, … }, size => { group => n, … } }
2459            
2460            
2461            
2462              
2463             =end html
2464              
2465              
2466              
2467             =head3 Examples
2468              
2469             =head4 Hash of arrays (each key is a group)
2470              
2471             my $res = oneway_test({
2472             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
2473             ctrl => [1, 1, 1, 0, 0, 0 ],
2474             });
2475            
2476             {
2477             Group => {
2478             Df => 1,
2479             "Sum Sq" => 61.6533333333333,
2480             "Mean Sq" => 61.6533333333333,
2481             "F value" => 177.504798464491,
2482             "Pr(>F)" => 1.31343255160843e-07,
2483             },
2484             Residuals => {
2485             Df => 9.81767348326473, # fractional: Welch correction
2486             "Sum Sq" => 3.47333333333333,
2487             "Mean Sq" => 0.353783749200256,
2488             },
2489             group_stats => {
2490             mean => { ctrl => 0.5, yield => 5.03333333333333 },
2491             size => { ctrl => 6, yield => 6 },
2492             },
2493             }
2494              
2495             =head4 Array of arrays (groups named by index)
2496              
2497             my $res = oneway_test([
2498             [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
2499             [1, 1, 1, 0, 0, 0 ],
2500             ]);
2501              
2502             Identical to the hash form, except C is keyed by position:
2503              
2504             group_stats => {
2505             mean => { "Index 0" => 5.03333333333333, "Index 1" => 0.5 },
2506             size => { "Index 0" => 6, "Index 1" => 6 },
2507             }
2508              
2509             =head4 Long format with a formula
2510              
2511             When your data is in columns rather than pre-split groups, name the response
2512             and factor columns with a formula. The factor's I become the groups and
2513             the factor's I becomes the top-level key:
2514              
2515             my $res = oneway_test(
2516             {
2517             len => [4.2, 11.5, 7.3, 16.5, 17.3, 13.6, 23.6, 18.5, 33.9],
2518             supp => [qw(VC VC VC OJ OJ OJ HI HI HI)],
2519             },
2520             formula => 'len ~ supp',
2521             );
2522             # $res->{supp}, $res->{Residuals}, $res->{group_stats} ...
2523              
2524             =head3 Classic equal-variance form
2525              
2526             my $res = oneway_test(\%groups, var_equal => 1); # or 'var.equal' => 1
2527              
2528             =head3 Notes
2529              
2530             =over
2531              
2532             =item * The default (Welch) does B require equal group sizes or equal variances;
2533             the pooled form (C<< var_equal =E 1 >>) assumes equal variances.
2534              
2535             =item * C is only meaningful for a hash input. Passing it with an array of
2536             arrays is an error.
2537              
2538             =item * Group order in the output is not guaranteed for hash inputs (it follows hash
2539             iteration order); read results by name, not position.
2540              
2541             =item * Avoid naming a factor C or C in a formula, since those
2542             are reserved top-level keys in the result.
2543              
2544             =back
2545              
2546             =head2 p_adjust
2547              
2548             Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"
2549              
2550             my @q = p_adjust(\@pvalues, $method);
2551              
2552             =head2 power_t_test
2553              
2554             $test_data = power_t_test(
2555             n => 30, delta => 0.5,
2556             sd => 1.0, sig_level => 0.05
2557             );
2558              
2559             It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution.
2560              
2561              
2562              
2563             =begin html
2564              
2565            
2566            
2567            
2568             Parameter
2569             Type
2570             Default
2571             Description
2572            
2573            
2574            
2575            
2576             n
2577             Float
2578             undef
2579             Number of observations (per group for two-sample, pairs for paired).
2580            
2581            
2582             delta
2583             Float
2584             undef
2585             True difference in means.
2586            
2587            
2588             sd
2589             Float
2590             1.0
2591             Standard deviation.
2592            
2593            
2594             sig_level
2595             Float
2596             0.05
2597             Significance level (Type I error probability). Also accepts sig.level.
2598            
2599            
2600             power
2601             Float
2602             undef
2603             Power of test (1 minus Type II error probability).
2604            
2605            
2606             type
2607             String
2608             "two.sample"
2609             Type of t-test: "two.sample", "one.sample", or "paired".
2610            
2611            
2612             alternative
2613             String
2614             "two.sided"
2615             One- or two-sided test: "two.sided", "one.sided", "greater", or "less".
2616            
2617            
2618             strict
2619             Boolean
2620             0 (False)
2621             Use strict interpretation of two-sided power calculations.
2622            
2623            
2624             tol
2625             Float
2626             ~1.22e-4
2627             Numerical tolerance used for the internal root-finding algorithm.
2628            
2629            
2630            
2631              
2632             =end html
2633              
2634              
2635              
2636             =head2 prcomp
2637              
2638             Principal Component Analysis
2639              
2640             =head3 Options
2641              
2642              
2643              
2644             =begin html
2645              
2646            
2647            
2648            
2649             Option
2650             Type
2651             Default
2652             Description
2653            
2654            
2655            
2656            
2657             center
2658             Boolean
2659             1 (True)
2660             If true, the variables are shifted to be zero-centered before the analysis takes place.
2661            
2662            
2663             scale
2664             Boolean
2665             0 (False)
2666             If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will croak to prevent division by zero.
2667            
2668            
2669             retx
2670             Boolean
2671             1 (True)
2672             If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key x.
2673            
2674            
2675             tol
2676             Number
2677             undef
2678             A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to tol times the standard deviation of the first component.
2679            
2680            
2681             rank
2682             Integer
2683             undef
2684             Optionally specify a strict limit on the number of principal components to return. The function will return min(rank, rows, columns) components.
2685            
2686            
2687            
2688              
2689             =end html
2690              
2691              
2692              
2693             =head3 Results
2694              
2695             =head4 Returned Data Structure
2696              
2697             The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis:
2698              
2699              
2700              
2701             =begin html
2702              
2703            
2704            
2705            
2706             Key
2707             Type
2708             Description
2709            
2710            
2711            
2712            
2713             sdev
2714             ArrayRef[Number]
2715             The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix.
2716            
2717            
2718             rotation
2719             ArrayRef[ArrayRef]
2720             A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components.
2721            
2722            
2723             x
2724             ArrayRef[ArrayRef]
2725             A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the retx option is true.
2726            
2727            
2728             center
2729             ArrayRef[Number] or 0
2730             The centering values used (typically the column means). Returns false (0) if centering was disabled.
2731            
2732            
2733             scale
2734             ArrayRef[Number] or 0
2735             The scaling values used (typically the column standard deviations). Returns false (0) if scaling was disabled.
2736            
2737            
2738             varnames
2739             ArrayRef[String]
2740             The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH).
2741            
2742            
2743            
2744              
2745             =end html
2746              
2747              
2748              
2749             =head3 Using array of arrays
2750              
2751             my $aoa = [
2752             [2, 4],
2753             [4, 2],
2754             [6, 6]
2755             ];
2756            
2757             my $pca = prcomp($aoa);
2758              
2759             which returns
2760              
2761             {
2762             center [
2763             [0] 4,
2764             [1] 4
2765             ],
2766             rotation [
2767             [0] [
2768             [0] 0.707106781186547,
2769             [1] 0.707106781186548
2770             ],
2771             [1] [
2772             [0] 0.707106781186548,
2773             [1] -0.707106781186547
2774             ]
2775             ],
2776             scale 0,
2777             sdev [
2778             [0] 2.44948974278318,
2779             [1] 1.4142135623731
2780             ],
2781             x [
2782             [0] [
2783             [0] -1.41421356237309,
2784             [1] -1.4142135623731
2785             ],
2786             [1] [
2787             [0] -1.4142135623731,
2788             [1] 1.41421356237309
2789             ],
2790             [2] [
2791             [0] 2.82842712474619,
2792             [1] 2.22044604925031e-16
2793             ]
2794             ]
2795             }
2796              
2797             =head3 Hash of Arrays
2798              
2799             my $hoa = { B => [4, 2, 6], A => [2, 4, 6] };
2800             my $pca = prcomp($hoa);
2801              
2802             =head2 quantile
2803              
2804             Calculates sample quantiles using R's continuous Type 7 interpolation.
2805              
2806             my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);
2807              
2808             If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).
2809              
2810             =head2 rbinom
2811              
2812             Create a binomial distribution of numbers
2813              
2814             my $binom = rbinom( n => $n, prob => 0.5, size => 9);
2815              
2816             It hooks directly into Perl's internal PRNG system, respecting C seeds.
2817              
2818             =head2 read_table
2819              
2820             I've tried to make this as simple as possible, trying to follow from R:
2821              
2822             my $test_data = read_table('t/HepatitisCdata.csv');
2823              
2824             =head3 options
2825              
2826              
2827              
2828             =begin html
2829              
2830            
2831            
2832            
2833             Option
2834             Description
2835             Example
2836            
2837            
2838            
2839            
2840             comment
2841             Comment character, by default #
2842             comment => % or whatever; does not apply with header
2843            
2844            
2845             output.type
2846             data type for output: array of hash, hash of array, or hash of hash
2847             'output.type' => 'aoh'
2848            
2849            
2850             filter
2851             Only take in rows with a certain filter
2852             filter => { Sex => sub {$_ eq 'f'} }
2853            
2854            
2855             row.names
2856             include row names in retrieved data; off by default
2857            
2858            
2859            
2860             sep
2861             field separator character; synonym with delim
2862             sep => "\t"
2863            
2864            
2865             delim
2866             field separator character; synonym with sep
2867             delim => "\t"
2868            
2869            
2870            
2871              
2872             =end html
2873              
2874              
2875              
2876             output types can be AOH (aoa), HOA (hoa), HOH (hoh)
2877              
2878             read_table($filename, 'output.type' => 'aoh');
2879            
2880             read_table($filename, 'output.type' => 'hoa');
2881              
2882             and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:
2883              
2884             $test_data = read_table(
2885             't/HepatitisCdata.csv',
2886             filter => {
2887             Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column
2888             },
2889             'output.type' => 'aoh'
2890             );
2891              
2892             the default delimiter is C<,>
2893             Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority.
2894              
2895             =head2 rnorm
2896              
2897             Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C.
2898              
2899             my ($rmean, $sd, $n) = (10, 2, 9999);
2900             my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);
2901              
2902             =head2 runif
2903              
2904             Make an approximately uniform distribution into an array
2905              
2906             =head3 named arguments
2907              
2908             my $unif = runif( n => $n, min => 0, max => 1);
2909              
2910             where C is the number of items, the values are between C and C
2911              
2912             =head3 positional args
2913              
2914             this is to match R's behavior:
2915              
2916             runif( 9 )
2917              
2918             will make 9 numbers in [0,1]
2919              
2920             runif(9, 0, 99)
2921              
2922             will match C, C, and C respectively
2923              
2924             =head2 sample
2925              
2926             take a sample of hash or array slices.
2927              
2928             my $h = sample(\%h, 4); # take 4 hash keys and their values into $h
2929              
2930             or, alternatively, with arrays:
2931              
2932             my $arr = sample(\@arr, 3); # take 3 indices of an array
2933              
2934             =head2 scale
2935              
2936             my @scaled_results = scale(1..5);
2937              
2938             You can also pass an options hash to disable centering or scaling:
2939              
2940             my @scaled_results = scale(1..5, { center => false, scale => 1 });
2941              
2942             It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently:
2943              
2944             my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);
2945              
2946             =head2 sd
2947              
2948             my $stdev = sd(2,4,4,4,5,5,7,9);
2949              
2950             Correct answer is 2.1380899352994
2951              
2952             C can accept both array references as well as arrays:
2953              
2954             my $stdev = sd([2,4,4,4,5,5,7,9]);
2955              
2956             As of version 0.02, sd will croak/die if any undefined values are provided.
2957              
2958             =head2 seq
2959              
2960             Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference.
2961              
2962             =head3 Standard integer sequence
2963              
2964             say 'seq(1, 5):';
2965             my @seq = seq(1, 5);
2966             say join(', ', @seq), "\n";
2967            
2968             say 'seq(1, 2, 0.25):';
2969             @seq = seq(1, 2, 0.25);
2970              
2971             =head3 Fractional steps
2972              
2973             say 'seq(1, 2, 0.25):';
2974             @seq = seq(1, 2, 0.25);
2975             say join(", ", @seq), "\n";
2976             for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop
2977             is_approx(pop @seq, $idx, "seq item $idx with fractional step");
2978             }
2979              
2980             =head3 Negative steps
2981              
2982             say 'seq(10, 5, -1):';
2983             @seq = seq(10, 5, -1);
2984             say join(", ", @seq), "\n";
2985             for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop
2986             is_approx(pop @seq, $idx, "seq item $idx with negative step");
2987             }
2988              
2989             =head2 shapiro_test
2990              
2991             tests to see if an array reference is normally distributed, returns a p-value and a statistic
2992              
2993             my $shapiro = shapiro_test(
2994             [1..5]
2995             );
2996              
2997             and returns the hash reference:
2998              
2999             {
3000             p.value 0.589650577093106,
3001             p_value 0.589650577093106,
3002             statistic 0.960870680168535,
3003             W 0.960870680168535
3004             }
3005              
3006             =head2 sum
3007              
3008             returns sum, but using both arrays and array references.
3009              
3010             my $test_data = [1..8];
3011             sum($test_data)
3012              
3013             which I prefer, compared to List::Util's required casting into an array:
3014              
3015             sum(@{ $test_data });
3016              
3017             which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B
3018              
3019             as of version 0.02, C will cause the script to die if any undefined values are provided
3020              
3021             =head2 summary
3022              
3023             Analogous to R's C, but does not deal with outputs from other functions.
3024             C only describes data as it is entered.
3025             An option C or its synonym C specifies the maximum number of rows that will print.
3026              
3027             =head3 array of array input
3028              
3029             my @arr;
3030             foreach my $i (0..18) {
3031             push @arr, runif(22);
3032             }
3033              
3034             and then C, or C
3035              
3036             ---------------------------------------------------------------------------
3037             Index # values Min. 1st Qu. Median Mean 3rd Qu. Max.
3038             ---------------------------------------------------------------------------
3039             0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633
3040             1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371
3041             2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878
3042             3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808
3043             4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043
3044             5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556
3045              
3046             =head3 hash of array input
3047              
3048             $test_data = summary(
3049             {
3050             A => runif(9),
3051             B => runif(9)
3052             },
3053             );
3054              
3055             =head2 t_test
3056              
3057             There are 1-sample and 2-sample t-tests, from one or two arrays:
3058              
3059             my $t_test = t_test( $array1, mu => 0.2334 );
3060              
3061             or 2-sample:
3062              
3063             $t_test = t_test(
3064             $array1, $array2,
3065             paired => 1
3066             );
3067              
3068             returns a hash reference, which looks like:
3069              
3070             conf_int => [
3071             -0.06672889, 0.25672889
3072             ],
3073             df => 5,
3074             estimate => 0.095,
3075             p_value => 0.19143688433660,
3076             statistic => 1.50996688705414
3077              
3078             the two groups compared can be specified, though not necessarily, as C and C, just like in R:
3079              
3080             $t_test = t_test(
3081             'x' => $array1, 'y' => $array2,
3082             paired => 1
3083             );
3084              
3085             =head3 Parameters
3086              
3087              
3088              
3089             =begin html
3090              
3091            
3092            
3093            
3094             Parameter
3095             Type
3096             Default
3097             Description
3098            
3099            
3100            
3101            
3102             x
3103             Array Reference
3104             Required
3105             The first vector of data. Must contain at least 2 elements.
3106            
3107            
3108             y
3109             Array Reference
3110             undef
3111             The second vector of data. Required for two-sample or paired tests.
3112            
3113            
3114             mu
3115             Float
3116             0.0
3117             The true value of the mean (or difference in means) for the null hypothesis.
3118            
3119            
3120             paired
3121             Boolean
3122             FALSE
3123             If true, performs a paired t-test. x and y must be the same length.
3124            
3125            
3126             var_equal
3127             Boolean
3128             FALSE
3129             If true, assumes equal variances (standard two-sample). If false, performs Welch's t-test with unequal variances.
3130            
3131            
3132             conf_level
3133             Float
3134             0.95
3135             Confidence level for the returned confidence interval. Must be between 0 and 1.
3136            
3137            
3138             alternative
3139             String
3140             "two.sided"
3141             Direction of the alternative hypothesis: "two.sided", "less", or "greater".
3142            
3143            
3144            
3145              
3146             =end html
3147              
3148              
3149              
3150             =head3 Return Hash
3151              
3152              
3153              
3154             =begin html
3155              
3156            
3157            
3158            
3159             Key
3160             Description
3161            
3162            
3163            
3164            
3165             statistic
3166             The computed t-statistic.
3167            
3168            
3169             df
3170             Degrees of freedom for the test.
3171            
3172            
3173             p_value
3174             The calculated p-value based on the test directionality.
3175            
3176            
3177             conf_int
3178             An Array Reference containing two elements: [lower_bound, upper_bound].
3179            
3180            
3181             estimate
3182             The estimated mean of x (one-sample) OR the mean of the differences (paired).
3183            
3184            
3185             estimate_x
3186             The estimated mean of the x vector (only returned in two-sample tests).
3187            
3188            
3189             estimate_y
3190             The estimated mean of the y vector (only returned in two-sample tests).
3191            
3192            
3193            
3194              
3195             =end html
3196              
3197              
3198              
3199             =head2 transpose
3200              
3201             Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes.
3202             Returns a new reference of the same type; the input is never modified.
3203              
3204             =head3 Array of array input
3205              
3206             Takes a reference to an array of array references and returns a new AoA where C.
3207              
3208             my $matrix = [[1, 2, 3], [4, 5, 6]];
3209             my $t = transpose($matrix);
3210             # [[1, 4],
3211             # [2, 5],
3212             # [3, 6]]
3213              
3214             All rows must be the same length; a ragged input is a fatal error.
3215             C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>.
3216              
3217             Dies if:
3218             - any inner element is not an array reference
3219             - rows differ in length (ragged array)
3220              
3221             =head3 Hash of hash input
3222              
3223             Takes a reference to a hash of hash references and returns a new HoH where C.
3224              
3225             my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } };
3226             my $t = transpose($table);
3227             # { score => { alice => 97, bob => 84 },
3228             # grade => { alice => 'A', bob => 'B' } }
3229              
3230             Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed.
3231              
3232             my $sparse = {
3233             a => { x => 1, y => 2 },
3234             b => { x => 3, z => 4 } };
3235            
3236             my $t = transpose($sparse);
3237             # { x => { a => 1, b => 3 },
3238             # y => { a => 2 },
3239             # z => { b => 4 } }
3240              
3241             An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>.
3242              
3243             Dies if any inner element is not a hash reference
3244              
3245             =head2 value_counts
3246              
3247             Count the values in a given data set, return a hash reference showing how many times each particular value is present.
3248              
3249             =head3 Scalar
3250              
3251             $hash = value_counts('c');
3252              
3253             returns C<< { c =E 1 } >>
3254              
3255             =head3 Array reference
3256              
3257             value_counts(['a','b','b']);
3258              
3259             returns C<< { a =E 1, b =E 2} >>
3260              
3261             =head3 Array
3262              
3263             my $value_counts = value_counts('a','b','b');
3264              
3265             like an array reference above, returns C<< { a =E 1, b =E 2} >>
3266              
3267             =head3 Hash
3268              
3269             my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } );
3270              
3271             returns C<< { a =E 2, b =E 1} >>
3272              
3273             =head3 Hash of array
3274              
3275             my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']});
3276              
3277             without a key (like above), the occurences of C, C, and C are counted.
3278              
3279             With a key, like C for above, only values within that hash key are counted:
3280              
3281             my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a');
3282              
3283             =head3 Hash of hash (table)
3284              
3285             $hash = value_counts( {
3286             A => {
3287             a => 'x',
3288             b => 'z'
3289             },
3290             B => {
3291             a => 'x'
3292             },
3293             C => {
3294             a => 'y'
3295             }
3296             }, 'a');
3297              
3298             the column, or second hash key, that you wish to count, is specified at the command line
3299              
3300             =head2 var
3301              
3302             as simple as possible:
3303              
3304             var(2, 4, 5, 8, 9)
3305              
3306             as of version 0.02, C will die if any undefined values are provided
3307              
3308             like C, C, etc., C can accept array references, to make code simpler:
3309              
3310             my $ref = \@arr;
3311             var($ref) = var(@arr)
3312              
3313             =head2 var_test
3314              
3315             As described by R: Performs an F test to compare the variances of two samples from normal populations
3316              
3317             use Stats::LikeR;
3318            
3319             my @x = (2.9, 3.0, 2.5, 2.6, 3.2);
3320             my @y = (3.8, 2.7, 4.0, 2.4);
3321            
3322             my $vt = var_test(\@x, \@y);
3323              
3324             also, conf_level can be set:
3325              
3326             $vt = var_test(\@x, \@y, conf_level => 0.99);
3327              
3328             as well as a ratio (from R: the hypothesized ratio of the population variances of C and C:
3329              
3330             $test_data = var_test(\@xk, \@yk, ratio => 2);
3331              
3332             =head2 view
3333              
3334             An R-style C for the structures C returns. Prints the first
3335             few rows of a dataframe as an aligned text table, with numeric columns
3336             right-justified, string columns left-justified, and undefined cells shown as
3337             C. Works on all three C values:
3338              
3339              
3340              
3341             =begin html
3342              
3343            
3344            
3345            
3346             `output.type`
3347             Perl structure
3348             What `view` shows
3349            
3350            
3351            
3352            
3353             aoh
3354             array of hash refs
3355             one line per row, sequential row numbers
3356            
3357            
3358             hoa
3359             hash of array refs
3360             values gathered column-wise by row index
3361            
3362            
3363             hoh
3364             hash of hash refs
3365             top-level keys become the row label column
3366            
3367            
3368            
3369              
3370             =end html
3371              
3372              
3373              
3374             =head3 Synopsis
3375              
3376             my $aoh = read_table('all.data.tsv', 'output.type' => 'aoh');
3377            
3378             view($aoh); # first 6 rows, like head()
3379             view($aoh, n => 20); # first 20 rows
3380             view($aoh, cols => [qw(id age tt)]); # force a column order
3381             view($aoh, 'row.names' => 'id'); # use column 'id' as the row label
3382             view($aoh, na => '.', max_width => 30);
3383            
3384             my $txt = view($aoh, return_only => 1); # capture the string, print nothing
3385             view($aoh, to => \*STDERR); # print somewhere other than STDOUT
3386              
3387             =head3 Output
3388              
3389             # AoH: 7 rows x 3 cols (showing 6)
3390             row_name Testosterone, total (nmol/L) age sex
3391             p1 18.2 41 M
3392             p2 NA 7 F
3393             p3 1.05 33 F
3394             p4 22.9 55 M
3395             p5 14 29 M
3396             p6 NA 62 F
3397             # ... 1 more row
3398              
3399             The banner reports the structure type, full dimensions, and how many rows are
3400             displayed. A footer appears only when rows are hidden.
3401              
3402             =head3 Arguments
3403              
3404             All arguments after the data reference are optional name/value pairs.
3405              
3406              
3407              
3408             =begin html
3409              
3410            
3411            
3412            
3413             Argument
3414             Default
3415             Meaning
3416            
3417            
3418            
3419            
3420             n
3421             6
3422             Number of rows to show. n greater than the table shows everything.
3423            
3424            
3425             rows
3426             6
3427             Number of rows to show. n greater than the table shows everything (synonymous with n)
3428            
3429            
3430             cols / columns
3431            
3432             Array ref pinning column order (and which columns appear).
3433            
3434            
3435             row.names
3436            
3437             Column to use as the row label (for aoh/hoa). See ordering note.
3438            
3439            
3440             na
3441             'NA'
3442             Token printed for undefined cells.
3443            
3444            
3445             max_width
3446             80
3447             Truncate any cell wider than this (column names are never truncated).
3448            
3449            
3450             ellipsis
3451             '...'
3452             Marker appended to truncated cells.
3453            
3454            
3455             gap
3456             2
3457             Spaces between columns.
3458            
3459            
3460             to
3461             STDOUT
3462             Filehandle to print to.
3463            
3464            
3465             return_only
3466             0
3467             If true, return the string and print nothing.
3468            
3469            
3470            
3471              
3472             =end html
3473              
3474              
3475              
3476             C always returns the formatted string, whether or not it also prints.
3477              
3478             =head3 A note on column order
3479              
3480             C stores rows as hashes, so the original CSV column order is not
3481             preserved. C therefore sorts columns by name for a stable, reproducible
3482             layout. Two conveniences soften this:
3483              
3484             =over
3485              
3486             =item * A column literally named C (the label C assigns to a
3487             leading blank header) is detected automatically and moved to the left as the
3488             row label.
3489              
3490             =item * Pass C<< cols =E [ ... ] >> to control both the order and the selection of columns
3491             shown.
3492              
3493             =back
3494              
3495             When no label column is present, C numbers the rows C<1, 2, 3, …>, the way
3496             R prints row names for an unnamed data frame.
3497              
3498             =head3 Edge cases
3499              
3500             =over
3501              
3502             =item * Empty input (C<[]> or C<{}>) prints a clean C<0 rows x 0 cols> banner.
3503              
3504             =item * Tabs, carriage returns, and newlines inside a cell are escaped (C<\t>, C<\r>,
3505             C<\n>) so one record always stays on one line.
3506              
3507             =item * A non-reference argument, or a hash whose values are plain scalars, dies with
3508             a clear message rather than producing garbled output.
3509              
3510             =back
3511              
3512             =head3 Tests
3513              
3514             The behavior above is covered by C (run with C): the three
3515             structure types, C boundaries, alignment, C rendering, truncation,
3516             C/C handling, control-character escaping, the C and
3517             C output paths, empty structures, and the error cases.
3518              
3519             =head2 wilcox_test
3520              
3521             $test_data = wilcox_test(
3522             [1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],
3523             [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]
3524             );
3525              
3526             Computes the Wilcoxon rank-sum / Mann-Whitney test (two samples) or the Wilcoxon signed-rank test (one sample or paired), following R's C conventions.
3527             This is an alternative to the t-test, that does not assume a normal distribution.
3528             With two array refs and no C flag it runs the two-sample rank-sum test; with a single sample, or with C<< paired =E 1 >>, it runs the signed-rank test. It calculates exact p-values by default for C<< N E 50 >> without ties; when ties (or, for the signed-rank case, zero differences) are present it automatically switches to the normal approximation with continuity correction.
3529              
3530             =head3 Calling conventions
3531              
3532             The first one or two array-ref arguments are taken positionally as C and C; everything after that is parsed as C<< key =E value >> pairs. The named forms C<< x =E >> and C<< y =E >> are also accepted and override the positional values. The flat argument list following the positional refs must contain an even number of elements, or the call dies with a usage message.
3533              
3534             # positional
3535             wilcox_test(\@x, \@y, paired => 1);
3536            
3537             # fully named
3538             wilcox_test(x => \@x, y => \@y, alternative => "greater", exact => 0);
3539              
3540             =head3 Input parameters
3541              
3542              
3543              
3544             =begin html
3545              
3546            
3547            
3548            
3549             Parameter
3550             Type
3551             Default
3552             Description
3553            
3554            
3555            
3556            
3557             x
3558             ARRAY ref
3559             (required)
3560             The first sample. Passed positionally or as x =>. Non-numeric and undefined elements are silently dropped; an empty or all-missing x is fatal. In the two-sample test mu is subtracted from each x value.
3561            
3562            
3563             y
3564             ARRAY ref
3565             undef
3566             The second sample. If present and paired is false, a two-sample rank-sum test is run. If paired is true, y is required and must be the same length as x. Omit it for the one-sample signed-rank test.
3567            
3568            
3569             paired
3570             boolean
3571             0 (false)
3572             Run a paired signed-rank test on the per-element differences x[i] - y[i] - mu. Requires y of equal length.
3573            
3574            
3575             correct
3576             boolean
3577             1 (true)
3578             Apply the continuity correction (±0.5) when using the normal approximation. Ignored when an exact p-value is computed.
3579            
3580            
3581             mu
3582             number
3583             0.0
3584             Null-hypothesis location shift. Subtracted from x (two-sample) or from each difference (one-sample / paired).
3585            
3586            
3587             exact
3588             boolean / undef
3589             undef (auto)
3590             Tri-state. undef (or absent) selects exact automatically: when both group sizes are < 50 and there are no ties (two-sample), or n < 50 with no ties (signed-rank). A true value forces the exact test, a false value forces the approximation. Exact is impossible with ties — or, for the signed-rank test, with zero differences — and falls back to the approximation with a warning.
3591            
3592            
3593             alternative
3594             string
3595             "two.sided"
3596             One of "two.sided", "less", or "greater". Selects the tail(s) used for the p-value.
3597            
3598            
3599            
3600              
3601             =end html
3602              
3603              
3604              
3605             =head3 Output
3606              
3607             Returns a hash ref with the following keys:
3608              
3609              
3610              
3611             =begin html
3612              
3613            
3614            
3615            
3616             Key
3617             Type
3618             Description
3619            
3620            
3621            
3622            
3623             statistic
3624             number
3625             The test statistic. For the two-sample test this is the Mann-Whitney W (the x rank sum minus nx*(nx+1)/2). For the signed-rank test it is V, the sum of the ranks assigned to the positive differences.
3626            
3627            
3628             p_value
3629             number
3630             The p-value for the chosen alternative, capped at 1.0. Two-sided p-values are 2 * min(p_less, p_greater).
3631            
3632            
3633             method
3634             string
3635             A human-readable description of the exact test variant that was run (see below).
3636            
3637            
3638             alternative
3639             string
3640             Echoes the alternative actually used ("two.sided", "less", or "greater").
3641            
3642            
3643            
3644              
3645             =end html
3646              
3647              
3648              
3649             The C string reports which path executed:
3650              
3651             =over
3652              
3653             =item * Two-sample: C<"Wilcoxon rank sum exact test">, C<"Wilcoxon rank sum test with continuity correction">, or C<"Wilcoxon rank sum test">.
3654              
3655             =item * One-sample / paired: C<"Wilcoxon exact signed rank test">, C<"Wilcoxon signed rank test with continuity correction">, or C<"Wilcoxon signed rank test">.
3656              
3657             =back
3658              
3659             =head3 Notes and edge cases
3660              
3661             Missing data is handled by listwise removal of non-numeric / undefined cells before ranking; in the paired case a pair is dropped if either member is missing. An empty C (or, in the two-sample case, an empty C) after this filtering is fatal.
3662              
3663             For the signed-rank test, exact zero differences are discarded before ranking (matching R), and their presence disables the exact computation. Both empty-after-filtering and all-zero-difference inputs are fatal.
3664              
3665             Ties are detected during ranking and trigger the tie-corrected variance in the normal approximation; they also rule out the exact p-value. When C is left on auto, the size thresholds (C<< E 50 >> per group, or C<< E 50 >> differences) are what gate the exact vs. approximate decision.
3666              
3667             =head2 write_table
3668              
3669             mimics R's C, with data as first argument to subroutine, and output file as second
3670              
3671             write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1);
3672              
3673             You can also precisely filter and reorder which columns are written by passing an array reference to C:
3674              
3675             write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);
3676              
3677             undefined variables are printed as C by default, but can be set as you wish using C
3678              
3679             write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan')
3680              
3681             as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set.
3682              
3683             Args can also be accepted:
3684              
3685             write_table( 'data' => \%flat, 'file' => $f );
3686              
3687             =head1 changes
3688              
3689             =head2 0.16
3690              
3691             changes to dist.ini, the minimum Perl version disappeared when I fixed other problems
3692              
3693             clarifications between run time and test dependencies
3694              
3695             addition of C function to sort AoH and HoA
3696              
3697             addition of C to translate array of hashes into a hash of arrays
3698              
3699             fix of long double functions: https://www.cpantesters.org/cpan/report/5d5d9836-6a5f-11f1-aadb-63fd6d8775ea
3700              
3701             =head3 C
3702              
3703             output residual keys now use names, not integers
3704              
3705             =head3 C
3706              
3707             =head3 Bug fixes
3708              
3709             B When
3710             C<< valid_n E= p >>, the cleanup freed the C I but not the
3711             per-row name strings it held (those had been transferred out of C,
3712             whose own array was already freed). The strings leaked on every such error.
3713             Added the per-entry C loop before freeing the array, matching the
3714             normal path.
3715              
3716             B Only the first hash value was
3717             checked to be a C; subsequent values were C'd unconditionally, so
3718             a malformed row (C<< { a =E {...}, b =E 5 } >>) dereferenced a non-reference. Every
3719             row is now validated, with the partial allocations cleaned up before the
3720             C, mirroring the existing AoH path.
3721              
3722             B<< C on a possibly-signed C. >> C is undefined for
3723             byte values ≥ 0x80 on platforms where C is signed. Cast to
3724             C<(unsigned char)> before the call.
3725              
3726             =head3 Speed / RAM improvements
3727              
3728             B C silently
3729             truncated any longer formula. Replaced with a buffer sized to
3730             C, so there is no fixed limit and no truncation.
3731              
3732             B<< C<.>-expansion buffer is now a growable heap buffer. >> C
3733             silently dropped expanded terms once full. It is now a buffer that doubles on
3734             demand. Appends also went from C (which rescans from the start every
3735             time — O(n²) over many columns) to an O(1) amortised append that tracks the
3736             write position.
3737              
3738             B The original
3739             C'd a C buffer, filled it, copied it into C, and freed it
3740             I — C allocations plus C copies. Each candidate row is now
3741             written straight into C at its prospective commit slot; a row that fails
3742             listwise deletion is simply overwritten by the next candidate. This removes the
3743             C allocate/free cycles and the copy loop entirely.
3744              
3745             B<< Categorical levels sorted with C. >> The level list used an O(n²) bubble
3746             sort; replaced with C (relevant only for high-cardinality factors).
3747              
3748             B<< Unused tail of C reclaimed after listwise deletion. >> C is allocated for
3749             all C rows up front (C is unknown until rows are scanned). When rows
3750             are dropped, C is now Ced down to C, returning the unused
3751             tail to the allocator before the OLS phase.
3752              
3753             B The argument-parsing index was widened from
3754             C to C to match C, and the HoH row count now uses
3755             C rather than relying on C's return value.
3756              
3757             =head3 Known limitations (left unchanged)
3758              
3759             =over
3760              
3761             =item * A multi-way term such as C is split only on the first C<*>, so it yields
3762             C, C, and C rather than a full three-way expansion. Deeper
3763             interactions silently fail (the unparsable term evaluates to C and the
3764             rows are dropped). This matches the documented two-way C<*> support.
3765              
3766             =item * HoA input takes the row count from the first column; columns shorter than
3767             that simply contribute dropped rows rather than raising an error.
3768              
3769             =back
3770              
3771             =head3 C
3772              
3773             =head4 Bug fixes
3774              
3775             B Nearly every C after an allocation
3776             leaked memory. C does a C, so anything allocated but not yet
3777             freed is lost. Affected paths:
3778              
3779             =over
3780              
3781             =item * AoA and hash first-pass errors leaked C and any C entries
3782             allocated so far.
3783              
3784             =item * Formula-mode "not found as an array ref" errors leaked C and C.
3785              
3786             =back
3787              
3788             All post-allocation errors now route through a single C label that frees
3789             every pointer unconditionally. Pointers are initialised to C and C
3790             is zero-allocated with C, so the cleanup is always safe to run.
3791              
3792             B<< Undefined and non-numeric cells silently coerced to C<0.0>. >> The original
3793             second pass used C<(svp && *svp) ? SvNV(*svp) : 0.0>, meaning an C or
3794             non-numeric cell was quietly treated as zero, silently corrupting the
3795             F-statistic. Each cell is now validated with C and C;
3796             the call dies naming the group and observation index, consistent with the rest
3797             of C (C, C, C, etc.).
3798              
3799             B C
3800             cast to C I adding, so an empty array (C returns C<-1>)
3801             produced C rather than C<0>. Changed to
3802             C so the C<+1> is done in signed arithmetic
3803             before the cast.
3804              
3805             B<< Unreliable group count from C. >> C returns the
3806             number of buckets in use rather than the number of keys for tied hashes.
3807             Replaced with C, which always returns the correct key count.
3808              
3809             =head4 Improvements
3810              
3811             B<< C accepted as an alias for C. >> R users write
3812             C; the argument parser now accepts both spellings.
3813              
3814             B C and manual C replaced
3815             with C, C, C, and C. C additionally
3816             preserves embedded NUL bytes in group key strings, which the previous
3817             C-based copies silently truncated.
3818              
3819             =head4 Known limitations (not changed)
3820              
3821             =over
3822              
3823             =item * A factor column named C or C in a formula call will
3824             collide with reserved top-level keys in the result hash.
3825              
3826             =item * Group names containing an embedded NUL are stored correctly but are still
3827             truncated at C when written into the output hash keys.
3828              
3829             =back
3830              
3831             =head3 C
3832              
3833             default view shifted to 80 characters to match Linux window length
3834              
3835             =head4 New features
3836              
3837             =over
3838              
3839             =item * B<< C is accepted as a synonym for C >> (the number of rows shown).
3840             Passing both C and C is an error.
3841              
3842             =item * B C validates its argument names
3843             against the documented set (C, C, C, C, C,
3844             C, C, C, C, C, C, C) and
3845             dies listing any it does not recognise, so a misspelt option (e.g. C)
3846             is caught instead of silently ignored.
3847              
3848             =item * B<< C / C is validated. >> It must be a non-negative integer; C or
3849             a non-numeric value now dies with a clear message instead of producing
3850             warnings and being treated as C<0>.
3851              
3852             =item * B
3853              
3854             =back
3855              
3856             =head4 Bug fixes
3857              
3858             =over
3859              
3860             =item * B<< C<< n =E 0 >> now still prints the column header. >> Column names were collected
3861             only from the rows being shown, so requesting zero rows produced an empty
3862             header line. At least one row is now scanned (when data exists) so the
3863             header always lists the columns.
3864              
3865             =item * B<< An empty hash (C<{}>) no longer dies. >> It was rejected as
3866             I<"neither ARRAY nor HASH">; it is now shown as an empty table
3867             (C<0 rows x 0 cols>), matching the handling of an empty array.
3868              
3869             =item * B<< The C alias now drives the Hash-of-Hashes label header. >> The
3870             header for the row-label column consulted only C, so
3871             C<< row_names =E 'id' >> displayed C instead of C. Both spellings are
3872             now honoured consistently.
3873              
3874             =item * B A Hash-of-Arrays column or
3875             Hash-of-Hashes row whose value is not actually an array/hash reference now
3876             renders as empty cells rather than throwing a dereference error.
3877              
3878             =back
3879              
3880             =head4 Performance
3881              
3882             =over
3883              
3884             =item * Column gathering no longer sorts once per scanned row. Unique column names
3885             are collected across the scanned rows and sorted a single time (same output
3886             order), and the ellipsis length is computed once rather than per cell.
3887              
3888             =back
3889              
3890             =head4 Tests
3891              
3892             =over
3893              
3894             =item * C is self-contained (the C implementation is inlined; it loads
3895             no other files) and covers the new argument handling, the bug fixes above,
3896             and the existing AoH / HoA / HoH behaviour, alignment, truncation, and
3897             output-path handling.
3898              
3899             =back
3900              
3901             =head3 C
3902              
3903             Corrected four bugs in the C XSUB plus a portability fix in its exact signed-rank helper. Behaviour on valid input is unchanged: the R-agreement cases (unpaired C, C

; paired one-sided C, C

; separated exact C, C

) all still match R's C.

3904              
3905             =head4 Bug fixes
3906              
3907             =over
3908              
3909             =item * B<< Invalid C is now rejected. >> Any value other than C or C previously fell through to the two-sided branch and returned a two-sided result mislabelled with the bad string, so a typo like C<< alternative =E "twosided" >> silently "worked". It now croaks unless C is one of C, C, C.
3910              
3911             =item * B When every observation is tied the approximation's variance collapses to 0 and the old code divided by C: C returned C

(a "significant" difference between identical samples). It now warns and returns C

.

3912              
3913             =item * B<< Two-sided continuity correction at C. >> R uses C, so the correction is C<0> when the statistic sits exactly on its mean; the old code used C<-0.5>. Example: C<< wilcox_test([1,4], [2,3], exact =E 0) >> changed from C

to C

(matches R).

3914              
3915             =item * B<< C no longer shadows libm. >> The local C accumulator (mean of the statistic) shadowed the C library C; renamed to C (two-sample) and C (signed-rank). No active miscompute, removed as a latent hazard.
3916              
3917             =back
3918              
3919             =head4 Cosmetic
3920              
3921             =over
3922              
3923             =item * Collapsed a no-op ternary that assigned the same signed-rank exact method string on both branches; the C field is now simply C.
3924              
3925             =back
3926              
3927             =head4 Portability (exact signed-rank helper)
3928              
3929             =over
3930              
3931             =item * B<< C no longer calls C. >> The C<2^n> normaliser is now built by exact repeated doubling, which has no long-double libm dependency. This fixes an C load failure reported by a CPAN smoker (FreeBSD, perl 5.20, C) whose libm lacks the long-double math functions; the symbol resolved on glibc, which is why local builds passed. C accumulation in the DP is retained — only the C call was at fault.
3932              
3933             =item * B<< C → C >> for C, C, and the DP loop counters, which also removes a C-to-C narrowing at the call site. The C result (C) stays signed so its negative-C sentinel still fires, and is cast to C only after the C<< k E 0 >> check.
3934              
3935             =back
3936              
3937             =head4 Tests
3938              
3939             =over
3940              
3941             =item * Added C (flat, no subtests): R-agreement cases, option handling (C, C, C, C, named/positional C/C, NA dropping), regressions for all four bug fixes, argument-error and C-validation checks, output shape, and C coverage of the two-sample, exact, and paired allocation paths.
3942              
3943             =back
3944              
3945             =head2 0.15
3946              
3947             C function added, similar to R's C
3948              
3949             C:
3950             filter => {
3951             'Testosterone, total (nmol/L)' => sub { defined $_ },
3952             }
3953              
3954             was broken by the change in undefined variables in 0.14, but is back to being C
3955              
3956             C improvement in sectioning in README
3957              
3958             Numerous changes to prevent quadmath/long double CPAN test failures
3959              
3960             Minimum Scalar::Util version in dist.ini is now 1.22, see https://www.cpantesters.org/cpan/report/6b682236-6567-11f1-a3bc-a055f9c4ba34
3961              
3962             C is no longer needed, and removed as a dependency
3963              
3964             =head3 C
3965              
3966             =head4 Bug fixes
3967              
3968             =over
3969              
3970             =item * B C strips a
3971             leading comment marker from the header line (so a file may begin with
3972             C<#id,val>), but that strip was dead code: the XS parser skipped I line
3973             beginning with the comment string before the callback ever saw it, so a
3974             commented header was silently dropped and the first data row was mistaken for
3975             the header. The parser now delivers the first content line even when it
3976             begins with the comment marker, and only skips comment lines after the header
3977             has been seen.
3978              
3979             =item * B The parser stripped
3980             C<\r> unconditionally, so a quoted value such as C<"x\ry"> lost its carriage
3981             return and would not survive a C -> C round-trip. C<\r>
3982             is now stripped only as part of a trailing CRLF line ending and as a stray CR
3983             I quotes; inside quotes it is literal data.
3984              
3985             =item * B<< Duplicate column names no longer corrupt C output. >> With
3986             C<< output.type =E 'hoa' >>, a repeated column name pushed the same cell once per
3987             occurrence, so the affected columns came out longer than the others and the
3988             arrays no longer lined up by row. Columns are now keyed by unique header name
3989             (first-seen order preserved, later values win, one warning emitted).
3990              
3991             =item * B Passing a defined argument
3992             that was not a CODE reference silently fell through to slurp mode and ignored
3993             the argument; it now croaks
3994             (I<"callback must be a CODE reference">).
3995              
3996             =item * B<< An undefined/empty C row-name now dies instead of keying on C<"">. >>
3997             With C<< output.type =E 'hoh' >>, a row whose row-name column was empty/undef was
3998             stored under the C<''> key and raised I<"uninitialized value"> warnings. It now
3999             dies, naming the column and the offending data row.
4000              
4001             =item * B A 1-based numeric
4002             filter key greater than the column count was accepted, then silently extended
4003             every row through the C<$_> write-back. It is now rejected up front with a
4004             message naming the column count.
4005              
4006             =item * B<< C and C together now die. >> Supplying both silently preferred
4007             C; passing both is now an explicit error (C remains an alias for
4008             C when used alone).
4009              
4010             =item * B The unknown-argument path used
4011             C to dump the offending names to STDOUT before dying; the names are now
4012             carried in the C message itself.
4013              
4014             =back
4015              
4016             =head4 Better diagnostics
4017              
4018             =over
4019              
4020             =item * Alignment errors now report B is ragged
4021             (I<"Alignment error on FILE data row N (X fields vs Y headers)">), instead of
4022             only the field/header counts.
4023              
4024             =back
4025              
4026             =head4 Memory-leak fixes (exception paths)
4027              
4028             The parser allocated its working buffers (C, C, and — in
4029             slurp mode — C) in the XS C block, i.e. I any validation, and
4030             freed them only by falling off the end of the function. Any non-local exit
4031             therefore leaked:
4032              
4033             =over
4034              
4035             =item * the open-failure C leaked the row buffer and field (and the slurp
4036             accumulator);
4037              
4038             =item * far more commonly, a C thrown B — which
4039             C does routinely on alignment errors, bad row names, and filter
4040             exceptions — unwound straight out of the XS frame and leaked the field, the
4041             current row, the line buffer, the slurp accumulator, I
4042             handle>.
4043              
4044             =back
4045              
4046             Allocations now happen in C after every croak-able check, and every
4047             long-lived resource (the file handle via C, the buffers via
4048             C) is tied to the save stack, which an exception unwinds. Measured
4049             with C: a C mid-file went from 5 leaked SVs to 0, and an
4050             open failure from 2 to 0. This is the likely source of the constant-size leaks
4051             seen in CPAN-tester reports for the exception-path tests.
4052              
4053             =head4 Performance
4054              
4055             =over
4056              
4057             =item * B<~2.5x faster parsing> (57 -> 145 MB/s on a 100k-row quoted file). The core
4058             loop appended one character at a time with C; it now
4059             scans runs of ordinary bytes with C / a bounded scan and appends each
4060             run in a single C, copying field contents in bulk rather than byte
4061             by byte.
4062              
4063             =back
4064              
4065             =head4 Internal / non-behavioral
4066              
4067             =over
4068              
4069             =item * XS declarations moved from C to C; allocations deferred into
4070             C (see the leak fixes above).
4071              
4072             =item * The filter loop now aliases the row hash with C
4073             instead of copying it with C. This removes a full
4074             per-row hash copy for every filtered row and fixes a latent staleness bug:
4075             after a filter mutated C<$_> and the change was written back, C<%_> still
4076             reflected the pre-mutation copy, so a subsequent filter in the same row saw
4077             stale values. With aliasing, C<%_> I the row, so write-backs are always
4078             visible.
4079              
4080             =back
4081              
4082             =head4 Known limitation (not changed)
4083              
4084             =over
4085              
4086             =item * B<< C does not round-trip back to C. >> C renders an
4087             C cell as an empty field by default, and C maps an empty
4088             field back to C, so the I round-trip is clean. But if a file is
4089             written with a token such as C<< 'undef.val' =E 'NA' >>, C has no
4090             inverse option and reads C back as the string C<'NA'>. C also
4091             cannot distinguish a deliberately quoted empty string (C<"">) from a missing
4092             value -- both become C. Adding an C-style option to
4093             C (mapping configurable tokens and/or empty fields to C)
4094             would close this gap.
4095              
4096             =back
4097              
4098             =head3 C
4099              
4100             =head4 Behavior change
4101              
4102             =over
4103              
4104             =item * B<< C cells now write as an empty field, not an empty string. >> A missing
4105             or C value renders as nothing between separators (C) rather than a
4106             quoted empty string (C / C). Supplying C<< 'undef.val' =E 'NA' >>
4107             (or any other token) still overrides this, exactly as before. This is the
4108             only change that can alter the bytes of an existing output file; if you relied
4109             on the previous default, pass C<< 'undef.val' =E '' >> to keep an explicit empty
4110             field, or your chosen placeholder.
4111              
4112             =back
4113              
4114             =head4 Bug fixes
4115              
4116             =over
4117              
4118             =item * B
4119             Previously, cells were looked up with the raw bytes of the column name
4120             (C), which fails to match a
4121             UTF-8-flagged hash key: the column header printed correctly but every cell
4122             under it came back empty. All lookups now fetch by SV (C), header
4123             lists are gathered and sorted as SVs (C + C, preserving the
4124             flag) instead of being round-tripped through C, and the C
4125             column is matched with C rather than C. Embedded NUL bytes in
4126             keys are handled correctly as a side effect.
4127              
4128             =item * B<< C<< col.names =E [] >> no longer loops forever. >> An empty C array made
4129             C return C<-1>, which — compared against an unsigned C loop
4130             index — wrapped to C and ran effectively without end. This was fixed
4131             for flat hashes previously; it was still present for hash-of-hashes,
4132             hash-of-arrays, and array-of-hashes, plus both C header-filtering
4133             loops. All such loops now use a signed index.
4134              
4135             =item * B One header loop used an
4136             C index that silently wrapped past 65,535 and never terminated.
4137             It now uses C like the rest of the code.
4138              
4139             =item * B Every other input shape
4140             rejects a nested reference with
4141             I<"Cannot write nested reference types to table">; a flat hash instead
4142             stringified it (e.g. C) into the file. It now croaks
4143             consistently.
4144              
4145             =item * B<< C<< 'undef.val' =E undef >> is handled cleanly. >> It previously called
4146             C on C, raising an I<"uninitialized value"> warning and
4147             yielding an empty string by accident. It is now treated explicitly as an empty
4148             field, with no warning.
4149              
4150             =back
4151              
4152             =head4 Memory-leak fixes (exception paths)
4153              
4154             =over
4155              
4156             =item * The row-key list gathered for hash-of-hashes input was leaked when the output
4157             file could not be opened.
4158              
4159             =item * The I<"Could not get headers"> croak on hash-of-arrays input leaked both the
4160             already-open filehandle and the headers array.
4161              
4162             =back
4163              
4164             =head4 Internal / non-behavioral
4165              
4166             =over
4167              
4168             =item * Numeric row labels are now formatted into a reused stack buffer instead of a
4169             per-row C / C allocation (no functional change; removes a
4170             cast-away-C and one allocation per row).
4171              
4172             =item * Several signed/unsigned index types were made consistent (C vs
4173             C) to match C and silence the conditions behind the loop bugs
4174             above.
4175              
4176             =back
4177              
4178             =head4 Tests
4179              
4180             =over
4181              
4182             =item * C expanded from 17 to 69 assertions. New coverage targets each
4183             fix above: the empty-field default and C<< undef.val =E undef >> (no warning),
4184             C<< col.names =E [] >> termination across all four input shapes, the
4185             >65,535-column header loop (gated behind C), in-sequence
4186             numeric row labels, nested-reference rejection, CSV quoting corners
4187             (carriage return, separators inside column names, multi-character separators),
4188             empty input writing no file, and UTF-8 column names and row keys. Two leak
4189             assertions cover the exception paths above.
4190              
4191             =back
4192              
4193             =head2 0.14
4194              
4195             C function added for rows
4196              
4197             C reads undefined values to C instead of C, which makes calculations easier
4198              
4199             C writes undef by default as an empty string C<''>
4200              
4201             C transforms a hash of hashes into an hash of arrays
4202              
4203             C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b
4204              
4205             Numerous switches from C to C for local precision, like above
4206              
4207             numerous changes to C for ease of use and working with datasets with numerous undefined values
4208              
4209             dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea
4210              
4211             C now should be complete, errors with confidence intervals fixed
4212              
4213             =head2 0.13
4214              
4215             C: speed improvements; commented headers are now allowed
4216              
4217             C: fix for
4218              
4219             Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182.
4220             main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203
4221             Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183.
4222             main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203
4223              
4224             C gives better warnings for incorrect types of data given
4225              
4226             Numerous changes to dist.ini to improve CPAN testing, especially for Win32
4227              
4228             =head2 0.12
4229              
4230             C can also take hash of arrays, and various mixes of data types
4231              
4232             C: Addition of C keywords in many places; should improve CPU performance
4233              
4234             Better POD formatting, correction of output hash for README's C
4235              
4236             C can now accept hash of hashes as input
4237              
4238             new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns
4239              
4240             removed unused function from C helpers
4241              
4242             C: addition of restrict keywords in preinit, should improve CPU performance
4243              
4244             MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing
4245              
4246             using C for tests of C, which may or may not work with CPAN testers (experimental)
4247              
4248             Added function name to warnings, so I actually know which function is producing the error
4249              
4250             C can also take C and C as args, in addition to positions
4251              
4252             fixed C as it could hang if given empty C or C
4253              
4254             Added C<__EXTENSIONS__> to source XS file for better CPAN testing
4255              
4256             =head2 0.11
4257              
4258             better POD formatting for tables
4259              
4260             addition of MANIFEST.skip to get better testing results on CPAN
4261              
4262             C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t
4263              
4264             C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes
4265              
4266             Better documentation for t-test
4267              
4268             =head2 0.10
4269              
4270             changes to compilation for CPAN, trying to get this work on Windows
4271              
4272             Addition of C and C
4273              
4274             C will work without key names, just like in R. Testing for C has improved.
4275              
4276             =head2 0.09
4277              
4278             context changes in XS C, C, and C to get better CPAN testing results
4279              
4280             C keywords added to C to increase speed
4281              
4282             =head2 0.08
4283              
4284             Speed improvement in C of hashes.
4285              
4286             Addition of C, C, C, C, and C functions
4287              
4288             Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call.
4289              
4290             Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better.
4291              
4292             C now returns hash-of-hash in {row}{column}
4293              
4294             =head2 0.07
4295              
4296             Addition of C function.
4297              
4298             Formulas can now be omitted from C, resulting in a stacked calculation as R would think.
4299              
4300             Addition of C for multi-group comparisons that does not assume normality like C does.
4301              
4302             C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name.
4303              
4304             =head2 0.06
4305              
4306             Changed compiler options so that Solaris will work
4307              
4308             signed integers changed to unsigned in C
4309              
4310             Added restrict keywords to C, and made C to C
4311              
4312             =head2 0.05
4313              
4314             Leak testing for C
4315              
4316             removal of Data::Printer dependency for easier CPAN testing
4317              
4318             switched several C variable to C so that clang doesn't complain
4319              
4320             added restrict keyword for C
4321              
4322             =head2 0.04
4323              
4324             addition of C function
4325              
4326             GNU source, to maximize compatibility and ease installation
4327              
4328             removal of JSON dependency to ease installation
4329              
4330             =head2 0.03
4331              
4332             Compatibility back to Perl 5.10
4333              
4334             =head2 0.02
4335              
4336             back-compatible to Perl 5.10, instead of original 5.40, ensuring more people can use it
4337              
4338             added var_test
4339              
4340             mean, min, sum, median, var, and max die with undefined values, and print the offending indices
4341              
4342             "group_stats" added to aov, for TukeyHSD in the future
4343              
4344             "cor" dies when given data with standard deviation of 0
4345              
4346             C now has C option, which shows how undefined values are printed to tables, which is C by default.