File Coverage

blib/lib/Stats/LikeR.pm
Criterion Covered Total %
statement 205 367 55.8
branch 71 204 34.8
condition 36 77 46.7
subroutine 30 36 83.3
pod 4 4 100.0
total 346 688 50.2


are exported by default.
line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             # ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance
3             require 5.010;
4 14     14   546487 use strict;
  14         23  
  14         482  
5 14     14   83 use feature 'say';
  14         22  
  14         3472  
6             package Stats::LikeR;
7             our $VERSION = 0.15;
8             require XSLoader;
9 14     14   7237 use Devel::Confess 'color';
  14         128225  
  14         43  
10 14     14   1370 use warnings FATAL => 'all';
  14         23  
  14         629  
11 14     14   5912 use autodie ':default';
  14         197331  
  14         55  
12 14     14   66792 use Exporter 'import';
  14         18  
  14         525  
13 14     14   53 use Scalar::Util 'looks_like_number';
  14         17  
  14         8547  
14             XSLoader::load('Stats::LikeR', $VERSION);
15             our @EXPORT_OK = qw(add_data aov cfilter chisq_test col col2col cor cor_test cov dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test view wilcox_test write_table);
16             our @EXPORT = @EXPORT_OK;
17              
18             require XSLoader;
19             # ---- filter DSL: col() builds a predicate via overloading (pure Perl) -------
20             # Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm;
21             # they need no XS. filter() is the XSUB.
22 26     26 1 139024 sub col { Stats::LikeR::Col->new($_[0]) }
23             {
24             package Stats::LikeR::Col;
25 26   33 26   176 sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] }
26             # build a comparison leaf; if operands were swapped (4 > col('x')), flip the op
27             sub _c {
28 26     26   47 my ($self, $val, $swapped, $op, $flip) = @_;
29 26 100       61 Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val);
30             }
31             use overload
32 13     13   29 '>' => sub { $_[0]->_c($_[1],$_[2],'>','<') },
33 3     3   6 '<' => sub { $_[0]->_c($_[1],$_[2],'<','>') },
34 1     1   4 '>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') },
35 1     1   4 '<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') },
36 3     3   6 '==' => sub { $_[0]->_c($_[1],$_[2],'==','==') },
37 1     1   15 '!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') },
38 0     0   0 'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') },
39 1     1   4 'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') },
40 0     0   0 'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') },
41 0     0   0 'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') },
42 2     2   5 'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') },
43 1     1   4 'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') },
44 14     14   123 fallback => 1;
  14         33  
  14         270  
45             }
46             {
47             package Stats::LikeR::Pred;
48 26     26   308 sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' }
49 4     4   52 sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' }
50             use overload
51 2     2   4 '&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) },
52 1     1   3 '|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) },
53 1     1   3 '!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) },
54 14     14   4601 fallback => 1;
  14         16  
  14         91  
55             }
56             sub summary {
57 5     5 1 660125 my ($data, %args);
58 5         38 my $current_sub = (split(/::/,(caller(0))[3]))[-1];
59              
60 5 100 66     29 if (@_ && ref $_[0]) {
61             # Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3)
62 4         4 $data = shift;
63 4         9 %args = @_; # capture any trailing key/value pairs
64             } else {
65             # Handles: summary(@runif) or summary(@runif, nrows => 2)
66             # Extract known trailing named arguments from the flat list
67 1   33     15 while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {
      33        
      33        
68 0         0 my $val = pop @_;
69 0         0 my $key = pop @_;
70 0         0 $args{$key} = $val;
71             }
72             # The remaining items in @_ make up the actual data array
73 1         6 my @list = @_;
74 1         1 $data = \@list;
75             }
76             # Normalize nrow -> nrows, default to 10
77 5   50     26 $args{nrows} //= delete($args{nrow}) // 10;
      66        
78 5         8 my $ref_type = ref $data;
79 5 50 66     17 if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) {
80 0         0 die "$current_sub' data must either be a hash or an array, not \"$ref_type\"";
81             }
82 5         6 my $single_arr = 0;
83 5 100 100     26 if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) {
84 2         4 $single_arr = 1;
85             }
86 5         11 my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');
87 5         6 my @out;
88 5 100       30 if ($single_arr == 1) {
    100          
    50          
89 2         3 push @out, '-' x 75;
90 2         11 my $header = sprintf('%9s ' x scalar @header, @header);
91 2         4 push @out, $header;
92 2         3 push @out, '-' x 75;
93 2         3 my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1;
  198         206  
  2         12  
94 2 50       8 if (scalar @undef > 0) {
95 0         0 say STDERR join (',', @undef);
96 0         0 die "The above indices are not defined in $current_sub";
97             }
98 2         2 my @numeric = grep {looks_like_number($_)} @{ $data };
  198         261  
  2         4  
99 2         51 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
100 2         72 my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
101 2         9 push @out, $vals;
102             } elsif ($ref_type eq 'ARRAY') {
103 1         2 push @out, '-' x 75;
104 1         6 my $header = sprintf('%9s ' x scalar @header, @header);
105 1         3 unshift @header, 'Index';
106 1         2 $header = 'Index ' . $header;
107 1         1 push @out, $header;
108 1         2 push @out, '-' x 75;
109 1         1 my $rows_printed = 0;
110 1         4 foreach my $index (0..$#$data) {
111 2         2 my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1;
  18         22  
  2         4  
112 2 50       6 if (scalar @undef > 0) {
113 0         0 say STDERR join (',', @undef);
114 0         0 die "The above indices are not defined for index $index in $current_sub";
115             }
116 2         2 my @numeric = grep {looks_like_number($_)} @{ $data->[$index] };
  18         24  
  2         3  
117 2         15 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
118 2         29 my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
119 2         4 push @out, $vals;
120 2         3 $rows_printed++;
121 2 50       6 last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe
122             }
123             } elsif ($ref_type eq 'HASH') {
124 2         4 push @out, '-' x 78;
125 2         10 my $header = sprintf('%9s ' x scalar @header, @header);
126 2         4 unshift @header, 'Key';
127 2         5 $header = ' Key ' . $header;
128 2         3 push @out, $header;
129 2         2 push @out, '-' x 78;
130 2         2 my $rows_printed = 0;
131 2         3 foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) {
  4         10  
  2         9  
132 3         5 my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1;
  27         32  
  3         7  
133 3 50       9 if (scalar @undef > 0) {
134 0         0 say STDERR join (',', @undef);
135 0         0 die "The above indices are not defined for key $key in $current_sub";
136             }
137 3         3 my @numeric = grep {looks_like_number($_)} @{ $data->{$key} };
  27         37  
  3         5  
138 3         24 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
139 3         7 my $print_key = substr($key, 0, 9);
140 3 50       21 if ((length $print_key) < 9) { # make sure that short keys line up correctly
141 3         5 $print_key .= ' ' x (9 - length $print_key);
142             }
143 3         41 my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
144 3         6 push @out, $vals;
145 3         3 $rows_printed++;
146 3 100       11 last if $rows_printed >= $args{nrows};
147             }
148             }
149 5         61 say join ("\n", @out);
150 5         27 return \@out;
151             }
152              
153             sub read_table {
154 523     523 1 730274 my $file = shift;
155 523 50       4901 die "read_table: \"$file\" is not a file\n" unless -f $file;
156 523 50       3250 die "read_table: \"$file\" is not readable\n" unless -r $file;
157              
158 523         661 my %input_args = @_;
159 523 100       906 if (exists $input_args{delim}) {
160             # FIX: sep + delim together used to silently prefer delim
161             die "read_table: pass either 'sep' or 'delim', not both\n"
162 1 50       3 if exists $input_args{sep};
163 1         3 $input_args{sep} = delete $input_args{delim};
164             }
165              
166 523 100       1489 my $default_sep = $file =~ /\.tsv$/i ? "\t" : ',';
167 523         1137 my %args = (
168             sep => $default_sep,
169             comment => '#',
170             %input_args,
171             );
172              
173 523         640 my %allowed_args = map { $_ => 1 } (
  2615         3319  
174             'comment', 'output.type', 'filter', 'row.names', 'sep',
175             );
176 523         954 my @undef_args = sort grep { !$allowed_args{$_} } keys %args;
  1067         1523  
177 523 50       819 if (@undef_args) {
178 0         0 my $current_sub = ( split /::/, (caller(0))[3] )[-1];
179             # FIX: no more printing to STDOUT from a library ('say'); the die
180             # message carries the offending argument names itself
181 0         0 die "the args \"@undef_args\" aren't defined for $current_sub\n";
182             }
183 523   100     1224 my $otype = $args{'output.type'} // 'aoh';
184 523 100       1136 die "read_table: output.type \"$otype\" isn't allowed (aoh, hoa, hoh)\n"
185             unless $otype =~ m/^(?:aoh|hoa|hoh)$/;
186              
187 522         535 my $filter = $args{filter};
188 522 50 66     1200 if (defined $filter && ref($filter) eq 'CODE') {
    50 66        
189 0         0 $filter = { 0 => $filter };
190             } elsif (defined $filter && ref($filter) ne 'HASH') {
191 0         0 die "'filter' must be a CODE or HASH reference\n";
192             }
193              
194 522         546 my (@data, %data, @header, @uniq_header,
195             %mapped_filters, @sorted_filter_flds, %seen_rownames);
196 522         467 my $data_row = 0;
197             _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {
198 6718     6718   7608 my ($line_ref) = @_;
199 6718 100       7560 if (!@header) {
200             # --- HEADER PROCESSING (copy made only here; runs once) ---
201 522         909 my @line = @$line_ref;
202             $line[0] =~ s/^\Q$args{comment}\E//
203 522 50 33     3141 if @line && defined $line[0] && length( $args{comment} // '' );
      50        
      33        
204 522         699 @header = @line;
205 522 100 66     1077 if (@header && $header[0] eq '') {
206 10         14 $header[0] = 'row_name';
207             }
208 522         466 my %seen_h;
209             # FIX: hoa output with duplicate column names used to push the
210             # same cell once PER OCCURRENCE, silently corrupting the column
211             # lengths. Iterate unique names (order preserved) instead.
212 522         566 @uniq_header = grep { !$seen_h{$_}++ } @header;
  1166         2166  
213 522         555 my @dup_cols = grep { $seen_h{$_} > 1 } @uniq_header;
  1165         1407  
214 522 100       610 warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n"
215             if @dup_cols;
216 522 100 100     1347 if ($otype eq 'hoh' && !defined $args{'row.names'}) {
217 4         8 $args{'row.names'} = $header[0];
218             }
219 522 50 66     771 if (defined $args{'row.names'}
220 61         74 && !grep { $_ eq $args{'row.names'} } @header) {
221 0         0 die "\"$args{'row.names'}\" isn't in the header of $file\n";
222             }
223 522 100       672 if ($filter) {
224 4         12 for my $k (keys %$filter) {
225 4 50       15 if ($k =~ /^\d+$/) {
226             # FIX: a numeric key past the last column used to be
227             # accepted and then silently extended every row via
228             # the $_ write-back
229 0 0       0 die "read_table: numeric filter key $k exceeds the "
230             . scalar(@header) . " columns of $file\n"
231             if $k > @header;
232 0         0 $mapped_filters{$k} = $filter->{$k};
233             } else {
234 4         12 my ($idx) = grep { $header[$_] eq $k } 0 .. $#header;
  44         54  
235 4 50       11 die "Filter column '$k' not found in header\n"
236             unless defined $idx;
237 4         16 $mapped_filters{ $idx + 1 } = $filter->{$k};
238             }
239             }
240 4         11 @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters;
  0         0  
241             }
242 522         1955 return;
243             }
244             # --- DATA PROCESSING (operate on $line_ref directly; no row copy) ---
245 6196         5233 $data_row++;
246 6196 100       7196 if (@$line_ref != @header) {
247             # FIX: alignment errors now say WHICH row is ragged
248 1         11 die sprintf "Alignment error on %s data row %d (%d fields vs %d headers).\n",
249             $file, $data_row, scalar @$line_ref, scalar @header;
250             }
251 6195         5405 my %line_hash;
252 6195         7433 for my $i (0 .. $#header) {
253 74732         67231 my $v = $line_ref->[$i];
254 74732 100 66     145140 $line_hash{ $header[$i] } = ( !defined($v) || $v eq '' ) ? undef : $v;
255             }
256             # --- APPLY FILTERS ---
257 6195 100       7180 if (@sorted_filter_flds) {
258             # FIX: 'local %_ = %line_hash' copied the whole row for every
259             # filtered row AND went stale after a $_ write-back. Aliasing the
260             # glob makes %_ THE row hash: zero copy, mutations always visible.
261 1847         1821 local *_ = \%line_hash;
262 1847         1550 my $skip = 0;
263 1847         1765 foreach my $fld (@sorted_filter_flds) {
264 1847 50       2713 local $_ = $fld == 0 ? $line_ref : $line_hash{ $header[ $fld - 1 ] };
265 1847 100       2354 if ( !$mapped_filters{$fld}->( $line_ref, \%line_hash ) ) {
266 1131         2232 $skip = 1;
267 1131         1095 last;
268             }
269 716 50       1800 if ( $fld > 0 ) { # write back any mutation made to $_
270 716         753 $line_ref->[ $fld - 1 ] = $_;
271 716 100 66     1503 $line_hash{ $header[ $fld - 1 ] }
272             = ( !defined($_) || $_ eq '' ) ? undef : $_;
273             }
274             }
275 1847 100       8545 return if $skip;
276             }
277             # Populate requested data structure
278 5064 100       7073 if ($otype eq 'aoh') {
    100          
    50          
279 1863         10983 push @data, \%line_hash;
280             } elsif ($otype eq 'hoa') {
281 1114         1205 push @{ $data{$_} }, $line_hash{$_} for @uniq_header;
  15757         30131  
282             } elsif ($otype eq 'hoh') {
283 2087         2334 my $row_name = $line_hash{ $args{'row.names'} };
284             # FIX: an undef/empty row-name cell used to become the '' hash
285             # key with "uninitialized" warnings; now it dies with the row
286             die sprintf "read_table: undefined row name (column '%s') in %s data row %d\n",
287 2087 50       2378 $args{'row.names'}, $file, $data_row
288             unless defined $row_name;
289             warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"
290 2087 100       3421 if $seen_rownames{$row_name}++;
291 2087         1988 foreach my $col (@uniq_header) {
292 29171 100       32998 next if $col eq $args{'row.names'};
293 27084         51430 $data{$row_name}{$col} = $line_hash{$col};
294             }
295             }
296 522   50     15010 });
      50        
297 521 100       6481 if ($otype eq 'aoh') {
298 509         2039 return \@data;
299             } else { # hoa or hoh
300 12         7373 return \%data;
301             }
302             }
303             # ---------------------------------------------------------------------------
304             # view(): an R-style `head` for the structures read_table() returns.
305             #
306             # Handles all three output.type values:
307             # aoh -> ARRAY of HASH refs
308             # hoa -> HASH of ARRAY refs
309             # hoh -> HASH of HASH refs
310             #
311             # Pure Perl; the only dependency is Scalar::Util (core). There is nothing for
312             # XS to speed up here: view only touches the first n rows, and the total-row
313             # count is O(1) for every structure (scalar @$aoh, array length, scalar keys).
314             # Keeping it pure Perl also keeps install portable (no compiler needed).
315             #
316             # Usage:
317             # view($data); # first 6 rows, like head()
318             # view($data, n => 20); # first 20 rows
319             # view($data, cols => [...]); # pin explicit column order
320             # view($data, na => '.', max_width => 30, gap => 1, ellipsis => '~');
321             # view($data, 'row.names' => 'id'); # use column 'id' as the row label
322             # view($data, to => \*STDERR); # print elsewhere
323             # my $s = view($data, return_only => 1); # capture string, suppress print
324             #
325             # Column order: read_table stores rows as hashes, so the original CSV column
326             # order is gone. view sorts columns by name for a stable layout and treats a
327             # column literally named 'row_name' (read_table's label for a leading blank
328             # header) as the row label. Pass cols => [...] to override the order.
329             # ---------------------------------------------------------------------------
330 14     14   39791 use Scalar::Util qw(looks_like_number);
  14         18  
  14         31916  
331              
332             sub view {
333 0     0 1   my $data = shift;
334 0           my %args = @_;
335 0 0         my $n = exists $args{n} ? $args{n} : 6;
336 0 0         my $na = exists $args{na} ? $args{na} : 'NA';
337 0 0         my $maxw = exists $args{max_width} ? $args{max_width} : 50;
338 0 0         my $ell = exists $args{ellipsis} ? $args{ellipsis} : '...';
339 0 0         my $gap = exists $args{gap} ? (' ' x $args{gap}) : ' ';
340 0   0       my $ucols = $args{cols} || $args{columns};
341 0           my $fh = $args{to};
342 0           my $quiet = $args{return_only};
343              
344             my $label_col = exists $args{'row.names'} ? $args{'row.names'}
345             : exists $args{row_names} ? $args{row_names}
346 0 0         : undef;
    0          
347 0           my $rt = ref $data;
348 0 0 0       die "view: expected an ARRAY (AoH) or HASH (HoA/HoH) reference, got "
      0        
349             . ($rt || 'a non-reference') . "\n"
350             unless $rt eq 'ARRAY' or $rt eq 'HASH';
351 0           my ($kind, @cols, @labels, @raw, $total, $lab_header);
352 0 0         if ($rt eq 'ARRAY') { # ---- AoH ----
    0          
353 0           $kind = 'AoH';
354 0           $total = scalar @$data;
355 0 0         my $show = $n < $total ? $n : $total;
356 0 0         $show = 0 if $show < 0;
357              
358 0 0         if ($ucols) {
359 0           @cols = @$ucols;
360             } else {
361 0           my %seen;
362 0           for my $i (0 .. $show - 1) {
363 0           my $row = $data->[$i];
364 0 0         next unless ref $row eq 'HASH';
365 0           push @cols, grep { !$seen{$_}++ } sort keys %$row;
  0            
366             }
367             }
368             my $lc = defined $label_col ? $label_col
369 0 0         : (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef;
  0 0          
370 0 0         if (defined $lc) {
371 0           @cols = grep { $_ ne $lc } @cols;
  0            
372 0           $lab_header = $lc;
373             }
374 0           for my $i (0 .. $show - 1) {
375 0   0       my $row = $data->[$i] || {};
376 0 0         push @labels, defined $lc ? $row->{$lc} : ($i + 1);
377 0           push @raw, [ map { $row->{$_} } @cols ];
  0            
378             }
379 0 0         $lab_header = '' unless defined $lab_header;
380             } elsif ($rt eq 'HASH') {
381 0           my @keys = keys %$data;
382 0           my $sample;
383 0 0         for my $k (@keys) { $sample = $data->{$k}; last if defined $sample; }
  0            
  0            
384 0           my $vt = ref $sample;
385              
386 0 0         if ($vt eq 'ARRAY') { # ---- HoA ----
    0          
387 0           $kind = 'HoA';
388 0 0         my @allcols = $ucols ? @$ucols : sort @keys;
389 0           $total = 0;
390 0           for my $k (@keys) {
391 0 0         my $l = ref $data->{$k} eq 'ARRAY' ? scalar @{ $data->{$k} } : 0;
  0            
392 0 0         $total = $l if $l > $total;
393             }
394 0 0         my $show = $n < $total ? $n : $total;
395 0 0         $show = 0 if $show < 0;
396             my $lc = defined $label_col ? $label_col
397 0 0         : (grep { $_ eq 'row_name' } @allcols) ? 'row_name' : undef;
  0 0          
398 0 0         @cols = grep { !defined $lc || $_ ne $lc } @allcols;
  0            
399 0 0         $lab_header = defined $lc ? $lc : '';
400 0           for my $i (0 .. $show - 1) {
401 0 0         push @labels, defined $lc ? $data->{$lc}[$i] : ($i + 1);
402 0           push @raw, [ map { $data->{$_}[$i] } @cols ];
  0            
403             }
404             } elsif ($vt eq 'HASH') { # ---- HoH ----
405 0           $kind = 'HoH';
406 0           $total = scalar @keys;
407 0           my @rk = sort @keys;
408 0 0         my $show = $n < $total ? $n : $total;
409 0 0         $show = 0 if $show < 0;
410 0 0         my @shown = $show > 0 ? @rk[0 .. $show - 1] : ();
411 0 0         if ($ucols) {
412 0           @cols = @$ucols;
413             } else {
414 0           my %seen;
415 0           for my $rkk (@shown) {
416 0           push @cols, grep { !$seen{$_}++ } sort keys %{ $data->{$rkk} };
  0            
  0            
417             }
418             }
419 0 0         @cols = grep { $_ ne $label_col } @cols if defined $label_col;
  0            
420 0 0         $lab_header = exists $args{'row.names'} ? $args{'row.names'} : 'row_name';
421 0           for my $rkk (@shown) {
422 0           push @labels, $rkk;
423 0           push @raw, [ map { $data->{$rkk}{$_} } @cols ];
  0            
424             }
425             } else {
426 0           die "view: HASH values are neither ARRAY (HoA) nor HASH (HoH) refs; "
427             . "cannot interpret as a table\n";
428             }
429             }
430              
431             # numeric detection per column (drives right/left alignment)
432 0           my @numeric = (1) x scalar @cols;
433 0           for my $r (@raw) {
434 0           for my $j (0 .. $#cols) {
435 0           my $v = $r->[$j];
436 0 0         next unless defined $v;
437 0 0         $numeric[$j] = 0 unless looks_like_number($v);
438             }
439             }
440 0 0         my $lab_numeric = @labels ? 1 : 0;
441 0 0 0       for my $l (@labels) { $lab_numeric = 0, last unless defined $l && looks_like_number($l); }
  0            
442              
443             # stringify + sanitize + truncate cells (column names left intact)
444             my $clean = sub {
445 0     0     my $v = shift;
446 0 0         return $na unless defined $v;
447 0           $v = "$v";
448 0           $v =~ s/\t/\\t/g; $v =~ s/\r/\\r/g; $v =~ s/\n/\\n/g;
  0            
  0            
449 0 0 0       if ($maxw && length($v) > $maxw) {
450 0           my $keep = $maxw - length($ell);
451 0 0         $keep = 0 if $keep < 0;
452 0           $v = substr($v, 0, $keep) . $ell;
453             }
454 0           return $v;
455 0           };
456 0           my @lab_s = map { $clean->($_) } @labels;
  0            
457 0           my @rows_s = map { [ map { $clean->($_) } @$_ ] } @raw;
  0            
  0            
458 0           my @head_s = @cols;
459              
460             # column widths
461 0           my $lab_w = length $lab_header;
462 0 0         $lab_w = length $_ > $lab_w ? length $_ : $lab_w for @lab_s;
463 0           my @w = map { length $_ } @head_s;
  0            
464 0           for my $r (@rows_s) {
465 0           for my $j (0 .. $#cols) {
466 0           my $l = length $r->[$j];
467 0 0         $w[$j] = $l if $l > $w[$j];
468             }
469             }
470              
471             my $pad = sub {
472 0     0     my ($s, $width, $right) = @_;
473 0 0         my $g = $width - length $s; $g = 0 if $g < 0;
  0            
474 0 0         return $right ? (' ' x $g) . $s : $s . (' ' x $g);
475 0           };
476              
477 0           my @out;
478 0           my $shown = scalar @rows_s;
479 0 0         push @out, sprintf("# %s: %d row%s x %d col%s (showing %d)",
    0          
480             $kind, $total, ($total == 1 ? '' : 's'),
481             scalar(@cols), (@cols == 1 ? '' : 's'), $shown);
482              
483 0           my @hcells = ( $pad->($lab_header, $lab_w, 0) );
484 0           push @hcells, $pad->($head_s[$_], $w[$_], $numeric[$_]) for 0 .. $#cols;
485 0           push @out, join($gap, @hcells);
486              
487 0           for my $ri (0 .. $#rows_s) {
488 0           my @cells = ( $pad->($lab_s[$ri], $lab_w, $lab_numeric) );
489 0           push @cells, $pad->($rows_s[$ri][$_], $w[$_], $numeric[$_]) for 0 .. $#cols;
490 0           push @out, join($gap, @cells);
491             }
492 0 0         push @out, sprintf("# ... %d more row%s", $total - $shown,
    0          
493             ($total - $shown == 1 ? '' : 's')) if $shown < $total;
494              
495 0           my $str = join("\n", @out) . "\n";
496 0 0         unless ($quiet) { defined $fh ? print {$fh} $str : print $str; }
  0 0          
  0            
497 0           return $str;
498             }
499              
500             1;
501             =encoding utf8
502              
503             =head1 Synopsis
504              
505             Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc.
506             I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.
507             There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.
508             This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.
509              
510             There B other modules on CPAN that can do B of this, but this works the way that I B it to.
511              
512             =head1 Functions/Subroutines
513              
514             ========================================================================
515              
516             =head2 add_data
517              
518             Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference.
519              
520             =head3 Hash of Hashes (HoH)
521              
522             When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows.
523              
524             $data = { 'Jack Smith' => { age => 30 } };
525            
526             $n = {
527             'Jack Smith' => { # Update existing (Hash)
528             dept => 'Engineering'
529             },
530             'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash)
531             'Invalid' => 'Not a reference' # Edge case safety
532             };
533            
534             add_data($data, $n);
535              
536             B
537              
538             {
539             "Jack Smith": {
540             "age": 30,
541             "dept": "Engineering"
542             },
543             "Jane Doe": {
544             "age": 25,
545             "dept": "Sales"
546             }
547             }
548              
549             =head3 Hash of Arrays (HoA)
550              
551             When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C.
552              
553             $data = { 'Project Alpha' => [ 'task1', 'task2' ] };
554             $n = {
555             'Project Alpha' => [ 'task3' ], # Appends to existing array
556             'Project Beta' => [ 'task1', 'task2' ] # Creates new array row
557             };
558             add_data($data, $n);
559              
560             B
561              
562             {
563             "Project Alpha": [ "task1", "task2", "task3" ],
564             "Project Beta": [ "task1", "task2" ]
565             }
566              
567             =head3 Array of Hashes / Arrays (AoH / AoA)
568              
569             C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices.
570              
571             $data = [
572             { id => 1, name => 'Alice' }
573             ];
574            
575             $n = [
576             { role => 'Admin' }, # Updates index 0
577             { id => 2, name => 'Bob' } # Creates index 1
578             ];
579            
580             add_data($data, $n);
581              
582             B
583              
584             [
585             { "id": 1, "name": "Alice", "role": "Admin" },
586             { "id": 2, "name": "Bob" }
587             ]
588              
589             =head3 Advanced Structural Coercion & Cross-Merging
590              
591             C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target.
592              
593             B<1. Inner Coercion (Mixing Rows):>
594              
595             =over
596              
597             =item * B Source Array rows are read in pairs and converted to key-value pairs.
598              
599             =item * B Source Hash rows are flattened into key-value pairs and pushed onto the array.
600              
601             =back
602              
603             B<2. Root-Level Coercion (Mixing Outer Containers):>
604              
605             =over
606              
607             =item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored.
608              
609             =item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">).
610              
611             =back
612              
613             =head3 Source is a mixed Hash. Keys dictate the target array index!
614              
615             $n = {
616             '0' => { y => 20 }, # Merges into $data->[0]
617             '1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1]
618             'ignored' => { k => 'v' } # Ignored: cannot map to an array index
619             };
620            
621             add_data($data, $n);
622              
623             B
624              
625             [
626             { "x": 10, "y": 20 },
627             { "z": 30 }
628             ]
629              
630             NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data.
631              
632             =head2 aov
633              
634             Warning: assumes normal distribution
635              
636             aov(
637             {
638             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
639             ctrl => [1, 1, 1, 0, 0, 0]
640             },
641             'yield ~ ctrl');
642              
643             which returns
644              
645             {
646             ctrl {
647             Df 1,
648             "F value" 25.6000000000001,
649             "Mean Sq" 1.70666666666667,
650             Pr(>F) 0.00718232855871859,
651             "Sum Sq" 1.70666666666667
652             },
653             Residuals {
654             Df 4,
655             "Mean Sq" 0.0666666666666665,
656             "Sum Sq" 0.266666666666666
657             }
658             }
659              
660             You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:
661              
662             my $res_2way = aov($data_2way, 'len ~ supp * dose');
663              
664             It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.
665              
666             =head3 Input Parameters
667              
668              
669              
670             =begin html
671              
672            
673            
674            
675             Parameter
676             Type
677             Default
678             Description
679             Example
680            
681            
682            
683            
684             data_sv
685             HashRef or ArrayRef
686             (Required)
687             The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's stack() on a named list).
688            
689            
690            
691             formula_sv
692             String
693             undef
694             A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to 'Value ~ Group' and the input data is stacked.
695             'yield ~ N * P'
696            
697            
698            
699              
700             =end html
701              
702              
703              
704             =head3 Output Variables
705              
706             The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs.
707              
708              
709              
710             =begin html
711              
712            
713            
714            
715             Parameter
716             Type
717             Default
718             Description
719             Example
720            
721            
722            
723            
724             (Term Name)
725             HashRef
726             undef
727             A nested hash for each independent term in the formula (e.g., 'Group', 'N:P'), containing its ANOVA table statistics.
728             {'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004}
729            
730            
731             Residuals
732             HashRef
733             undef
734             A nested hash containing the residual (error) statistics for the fitted model.
735             {'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55}
736            
737            
738             group_stats
739             HashRef
740             undef
741             A nested hash containing descriptive statistics (mean and size / count) for every column evaluated in the original unstacked data structure.
742             {'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}}
743            
744            
745            
746              
747             =end html
748              
749              
750              
751             =head3 omitting formula
752              
753             As of version 0.07, in the case of an omitted formula, stacking is done:
754              
755             aov(
756             {
757             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
758             ctrl => [1, 1, 1, 0, 0, 0]
759             },
760             );
761              
762             is the equivalent of:
763              
764             yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2)
765             ctrl <- c(1, 1, 1, 0, 0, 0)
766            
767             # Combine them into a named list (the R equivalent of your hash)
768             my_list <- list(yield = yield, ctrl = ctrl)
769            
770             # Convert the list into a "long" dataframe
771             # This creates two columns: "values" and "ind" (the group name)
772             my_data <- stack(my_list)
773            
774             # Rename columns for clarity (optional but good practice)
775             colnames(my_data) <- c("Value", "Group")
776             anova_model <- aov(Value ~ Group, data = my_data)
777             summary(anova_model)
778              
779             in R
780              
781             =head2 cfilter
782              
783             Select B out of a table and return it in the same shape. A column is
784             the inner (second-level) key of a B or an B,
785             or the outer key of a B:
786              
787             use Stats::LikeR;
788             my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] );
789             cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] }
790             cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] }
791              
792             C takes exactly one of C or C. C returns only the
793             matching columns; C returns everything except them. The result is the
794             same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values
795             copied and the original structure left untouched.
796              
797             =head3 Selecting by name
798              
799             Pass an array ref of column names. Naming a column that is not present in the
800             data is an error (it catches typos), and a row that happens not to contain a
801             kept column simply comes back without it:
802              
803             my @aoh = ( { a => 1, b => 2 }, { a => 3 } );
804             cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ]
805              
806             =head3 Selecting by a predicate
807              
808             Instead of names, C/C accept a B — a CODE ref or a
809             function name — evaluated once per column. It is called as
810              
811             $predicate->($column_values, $column_name)
812              
813             where C<$column_values> is an array ref of the column's B cells (undef
814             and missing cells are dropped, so functions like C get clean input).
815             With C, columns for which the predicate is true are kept; with C,
816             those columns are dropped.
817              
818             # Keep only the constant columns (standard deviation zero):
819             my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] }
820             # Drop the constant columns instead:
821             my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... }
822             # A bare function name resolves in Stats::LikeR:: (use a package for your own):
823             cfilter(\%hoa, keep => 'some_predicate');
824              
825             A bare string is always treated as a B, not a single column
826             name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>.
827              
828             =head3 Errors
829              
830             C dies (via C) when:
831              
832             =over
833              
834             =item * neither C nor C is given, or both are,
835              
836             =item * a named column is not present in the data,
837              
838             =item * the selector is neither an array ref nor a code ref / function name, or the
839             function name cannot be resolved,
840              
841             =item * an unknown option is given, or the options are not C<< name =E value >> pairs,
842              
843             =item * the data is not a hash/array reference of the expected shape (a hash of hash
844             refs or array refs, or an array of hash refs).
845              
846             =back
847              
848             =head2 chisq_test
849              
850             The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results.
851              
852             For 2x2 matrices, Yates' Continuity Correction is applied automatically.
853              
854             =head3 Accepted Inputs
855              
856              
857              
858             =begin html
859              
860            
861            
862            
863             Input Type
864             Data Structure
865             Applied Test
866            
867            
868            
869            
870             1D Array
871             [ $v1, $v2, ... ]
872             Chi-squared test for given probabilities
873            
874            
875             2D Array
876             [ [ $v1, $v2 ], [ $v3, $v4 ] ]
877             Pearson's Chi-squared test (Yates' correction if 2x2)
878            
879            
880             1D Hash
881             { key1 => $v1, key2 => $v2 }
882             Chi-squared test for given probabilities
883            
884            
885             2D Hash
886             { row1 => { c1 => $v1, c2 => $v2 } }
887             Pearson's Chi-squared test (Yates' correction if 2x2)
888            
889            
890            
891              
892             =end html
893              
894              
895              
896             =head3 Output Object Structure
897              
898             The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input.
899              
900              
901              
902             =begin html
903              
904            
905            
906            
907             Key
908             Data Type
909             Description
910            
911            
912            
913            
914             data.name
915             String
916             Identifies the input type (e.g., "Perl ArrayRef" or "Perl HashRef").
917            
918            
919             expected
920             Array/Hash Ref
921             The expected frequencies, matching the geometry of the input.
922            
923            
924             method
925             String
926             The specific statistical test applied.
927            
928            
929             observed
930             Array/Hash Ref
931             The original data passed to the function.
932            
933            
934             p.value
935             Float
936             The calculated p-value of the test.
937            
938            
939             parameter
940             Hash Ref
941             Contains the degrees of freedom (df).
942            
943            
944             statistic
945             Hash Ref
946             Contains the test statistic (X-squared).
947            
948            
949            
950              
951             =end html
952              
953              
954              
955             =head3 Two-Dimensional Array
956              
957             Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically.
958              
959             my $test_data = [
960             [762, 327, 468],
961             [484, 239, 477]
962             ];
963             my $res = chisq_test($test_data);
964              
965             B
966              
967             {
968             'data.name' => 'Perl ArrayRef',
969             'expected' => [
970             [ 703.671381936888, 319.645266594124, 533.683351468988 ],
971             [ 542.328618063112, 246.354733405876, 411.316648531012 ]
972             ],
973             'method' => "Pearson's Chi-squared test",
974             'observed' => [
975             [ 762, 327, 468 ],
976             [ 484, 239, 477 ]
977             ],
978             'p.value' => 2.95358918321176e-07,
979             'parameter' => { 'df' => 2 },
980             'statistic' => { 'X-squared' => 30.0701490957547 }
981             }
982              
983             =head3 1-Dimensional Array (Goodness of Fit)
984              
985             Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items.
986              
987             my $data = [10, 20, 30];
988             my $res = chisq_test($data);
989              
990             B
991              
992             {
993             'data.name' => 'Perl ArrayRef',
994             'expected' => [ 20, 20, 20 ],
995             'method' => 'Chi-squared test for given probabilities',
996             'observed' => [ 10, 20, 30 ],
997             'p.value' => 0.00673794699908547,
998             'parameter' => { 'df' => 2 },
999             'statistic' => { 'X-squared' => 10 }
1000             }
1001              
1002             =head3 2-Dimensional Hash (Pearson's Chi-squared)
1003              
1004             Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings.
1005              
1006             my $data = {
1007             GroupA => { Success => 10, Failure => 15 },
1008             GroupB => { Success => 20, Failure => 5 }
1009             };
1010            
1011             my $res = chisq_test($data);
1012              
1013             B
1014              
1015             {
1016             'data.name' => 'Perl HashRef',
1017             'expected' => {
1018             'GroupA' => { 'Failure' => 10, 'Success' => 15 },
1019             'GroupB' => { 'Failure' => 10, 'Success' => 15 }
1020             },
1021             'method' => "Pearson's Chi-squared test with Yates' continuity correction",
1022             'observed' => {
1023             'GroupA' => { 'Failure' => 15, 'Success' => 10 },
1024             'GroupB' => { 'Failure' => 5, 'Success' => 20 }
1025             },
1026             'p.value' => 0.00937475878430379,
1027             'parameter' => { 'df' => 1 },
1028             'statistic' => { 'X-squared' => 6.75 }
1029             }
1030              
1031             =head3 One-Dimensional Hash (Goodness of Fit)
1032              
1033             Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks.
1034              
1035             my $data = {
1036             Apples => 10,
1037             Oranges => 20,
1038             Bananas => 30
1039             };
1040            
1041             my $res = chisq_test($data);
1042              
1043             =head2 C
1044              
1045             Apply a B to every pair of columns in a table and collect
1046             the answers in a hash of hashes.
1047              
1048             It's the workhorse behind things like correlation matrices: give it your data and
1049             the name of a function that takes two columns (C, C, …) and you get
1050             back every column compared against every other column.
1051              
1052             use Stats::LikeR;
1053            
1054             my %data = (
1055             height => [ 170, 165, 180, 175 ],
1056             weight => [ 70, 60, 85, 77 ],
1057             age => [ 30, 41, 25, 38 ],
1058             );
1059            
1060             my $result = col2col(\%data, 'cor');
1061            
1062             # $result->{height}{weight} == correlation of height vs weight
1063             # $result->{height}{age} == correlation of height vs age
1064             # ...and so on for every pair
1065              
1066             ========================================================================
1067              
1068             =head3 Arguments
1069              
1070             col2col( $data, $command, $cols, %options )
1071             col2col( $data, $command, \%options ) # options in place of $cols
1072              
1073              
1074              
1075             =begin html
1076              
1077            
1078            
1079            
1080             Position
1081             Argument
1082             What it is
1083            
1084            
1085            
1086            
1087             1
1088             $data
1089             Your table, as a reference (see Data shapes below).
1090            
1091            
1092             2
1093             $command
1094             A code block or the name of a two-column function.
1095            
1096            
1097             3
1098             $cols
1099             (optional) Which columns to use as the "from" side. Omit for all.
1100            
1101            
1102             4+
1103             %options
1104             (optional) na, skip.errors, … (see Options).
1105            
1106            
1107            
1108              
1109             =end html
1110              
1111              
1112              
1113             ========================================================================
1114              
1115             =head3 Data shapes
1116              
1117             C understands three layouts. In every case a B is the thing that
1118             gets compared, and the result is keyed by column name.
1119              
1120             B — keys are column names:
1121              
1122             my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] );
1123              
1124             B — First keys are row names, second keys are columns:
1125              
1126             my %hoh = (
1127             row1 => { a => 1, b => 4 },
1128             row2 => { a => 2, b => 5 },
1129             );
1130              
1131             B — each element is a row, inner keys are columns:
1132              
1133             my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } );
1134              
1135             All three produce the same result for the same underlying numbers. Missing or
1136             C cells are handled by the C option (below).
1137              
1138             ========================================================================
1139              
1140             =head3 The command
1141              
1142             The second argument is the function applied to each pair of columns. It is called
1143             as:
1144              
1145             $command->( $column_a, $column_b ) # two ARRAY refs
1146              
1147             so inside a block the two columns arrive in C<@_>:
1148              
1149             my $result = col2col(\%data, sub {
1150             my ($x, $y) = @_; # $x and $y are array refs
1151             cor($x, $y);
1152             });
1153              
1154             You can also pass a B. A bare name is looked up in
1155             C, so these two are equivalent:
1156              
1157             col2col(\%data, 'cor');
1158             col2col(\%data, sub { cor($_[0], $_[1]) });
1159              
1160             ========================================================================
1161              
1162             =head3 The result
1163              
1164             Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>.
1165              
1166             for my $from (sort keys %$result) {
1167             for my $to (sort keys %{ $result->{$from} }) {
1168             printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to};
1169             }
1170             }
1171              
1172             A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist.
1173              
1174             ========================================================================
1175              
1176             =head3 Restricting columns (C<$cols>)
1177              
1178             By default every column is used as the "from" side. The third argument narrows
1179             that down — handy when you only care about one variable.
1180              
1181             # all columns vs all columns
1182             my $all = col2col(\%data, 'cor');
1183             # just ONE column vs every other column
1184             my $one = col2col(\%data, 'cor', 'height');
1185             my $cors = $one->{height}; # { weight => ..., age => ... }
1186             # a FEW specific columns vs every other column
1187             my $few = col2col(\%data, 'cor', ['height', 'weight']);
1188              
1189             The "to" side is always every other column; C<$cols> only limits the outer keys.
1190              
1191             ========================================================================
1192              
1193             =head3 Options
1194              
1195             Options can be given two ways:
1196              
1197             col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols
1198             col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed
1199              
1200             The hash-ref form is convenient when you have B column restriction — it saves
1201             you from passing a placeholder. (A hash ref I C<$cols>, so you can't use
1202             it to restrict columns at the same time; use the trailing form for that.)
1203              
1204             =head4 C — how undefined values are handled
1205              
1206             Real data has gaps. C decides what the function sees.
1207              
1208              
1209              
1210             =begin html
1211              
1212            
1213            
1214            
1215             Value
1216             Behaviour
1217             Use for
1218            
1219            
1220            
1221            
1222             'pairwise' (default)
1223             A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length.
1224             Paired stats like cor.
1225            
1226            
1227             'omit'
1228             Each column drops its own undefined values independently. The two columns may end up different lengths.
1229             Unpaired tests like t_test, kruskal_test, where a gap in one sample shouldn't discard a value in the other.
1230            
1231            
1232             'keep'
1233             Every row is passed through, undef and all.
1234             When your function does its own missing-data handling.
1235            
1236            
1237            
1238              
1239             =end html
1240              
1241              
1242              
1243             # correlation: keep only complete pairs (the default)
1244             col2col(\%data, 'cor');
1245             # two-sample test: each column keeps its own values
1246             col2col(\%data, 't_test', undef, na => 'omit');
1247             col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder
1248              
1249             C / C remain as boolean aliases for backward compatibility:
1250             C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C.
1251              
1252             =head4 C — keep going when a pair fails I<(default: true)>
1253              
1254             Some functions croak on degenerate input — for example C dies if a column has
1255             zero variance. By default C B that croak per pair: instead of
1256             aborting the whole run, it stores the B of the error message in that
1257             cell, so the result tells you I pair failed and I. Every other cell is
1258             computed normally.
1259              
1260             my $r = col2col(\%data, 'cor');
1261             # a good pair: $r->{a}{b} == 0.83
1262             # a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0'
1263              
1264             To restore the old "die on the first error" behaviour, turn it off:
1265              
1266             col2col(\%data, 'cor', undef, 'skip.errors' => 0);
1267             col2col(\%data, 'cor', { 'skip.errors' => 0 });
1268              
1269             Only errors from B are trapped. Mistakes in the call itself
1270             (unknown column, bad data, unknown function name, unknown option) always die.
1271              
1272             ========================================================================
1273              
1274             =head3 Worked examples
1275              
1276             B
1277              
1278             my $m = col2col(\%data, 'cor');
1279              
1280             B
1281              
1282             my $col = 'Testosterone, total (nmol/L)';
1283             my $cors = col2col($hoa, 'cor', $col)->{$col};
1284             for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) {
1285             next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message
1286             printf "%-30s % .3f\n", $other, $cors->{$other};
1287             }
1288              
1289             B
1290              
1291             my $t = col2col($hoa, 't_test', undef, na => 'omit');
1292              
1293             B
1294              
1295             my $m = col2col($hoa, 'cor');
1296             for my $from (sort keys %$m) {
1297             for my $to (sort keys %{ $m->{$from} }) {
1298             my $v = $m->{$from}{$to};
1299             warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error
1300             }
1301             }
1302              
1303             ========================================================================
1304              
1305             =head3 Gotchas
1306              
1307             =over
1308              
1309             =item * B, C<($col_a, $col_b)> — not a column and
1310             a name. Unpack with C.
1311              
1312             =item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall
1313             variance can be flat on just the rows it shares with one partner, so C may
1314             still croak for that pair. With the default C, that shows up as a
1315             message in the single offending cell rather than killing the run.
1316              
1317             =item * B<< C does not modify your data. >> It reads the table and returns a new
1318             hash of hashes.
1319              
1320             =item * B — i.e.
1321             C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0>
1322             means column C is the degenerate one for that pair.
1323              
1324             =back
1325              
1326             =head2 cor
1327              
1328             cor($array1, $array2, $method = 'pearson'),
1329              
1330             that is, C is the default and will be used if C<$method> is not specified.
1331              
1332             Just like R, C, C, and C are available
1333              
1334             If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically.
1335              
1336             =head2 cor_test
1337              
1338             my $result = cor_test(
1339             'x' => $x,
1340             'y' => $y,
1341             alternative => 'two.sided',
1342             method => 'pearson',
1343             continuity => 1
1344             );
1345              
1346             C safely handles C (or C) values seamlessly by computing over pairwise complete observations.
1347              
1348             =head2 cov
1349              
1350             cov($array1, $array2, 'pearson')
1351              
1352             or
1353              
1354             cov($array1, $array2, 'spearman')
1355              
1356             or
1357              
1358             cov($array1, $array2, 'kendall')
1359              
1360             =head2 dnorm
1361              
1362             gives the density of the normal distribution, with the specified mean and standard deviation.
1363              
1364             In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value.
1365              
1366             returns a single scalar/number if a single value is given, otherwise returns an array reference.
1367              
1368             Usage:
1369              
1370             dnorm(4) # assumes a mean of 0 and standard deviation of 1
1371              
1372             but default mean, standard deviation, and log can be passed as parameters:
1373              
1374             $x = dnorm(0, mean => 0, sd => 2, 'log' => 0);
1375              
1376             =head2 filter
1377              
1378             Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified.
1379              
1380             my $df2 = filter($df, col('column.name') > 4);
1381              
1382             C accepts a predicate in one of two forms:
1383              
1384             =over
1385              
1386             =item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and
1387              
1388             =item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>.
1389              
1390             =back
1391              
1392             Both C and C
1393              
1394             =head3 Arguments
1395              
1396              
1397              
1398             =begin html
1399              
1400            
1401            
1402            
1403             Position
1404             Name
1405             Description
1406            
1407            
1408            
1409            
1410             1
1411             $df
1412             The data frame to filter. Either an array of hashes (AoH — e.g. the default output of read_table) or a hash of arrays (HoA).
1413            
1414            
1415             2
1416             predicate
1417             Either a col() comparison object or a CODE reference.
1418            
1419            
1420            
1421              
1422             =end html
1423              
1424              
1425              
1426             The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned.
1427              
1428             =head3 The C form
1429              
1430             C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row.
1431              
1432             filter($df, col('age') >= 18); # keep rows where age >= 18
1433             filter($df, col('sex') eq 'f'); # keep rows where sex is 'f'
1434             filter($df, 18 <= col('age')); # operands may be in either order
1435              
1436             =head3 Comparison operators
1437              
1438              
1439              
1440             =begin html
1441              
1442            
1443            
1444            
1445             Kind
1446             Operators
1447             Comparison
1448            
1449            
1450            
1451            
1452             Numeric
1453             > < >= <= == !=
1454             numeric (the cell and the value are compared as numbers)
1455            
1456            
1457             String
1458             gt lt ge le eq ne
1459             string (the cell and the value are compared as strings)
1460            
1461            
1462            
1463              
1464             =end html
1465              
1466              
1467              
1468             C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>.
1469              
1470             =head3 Combining predicates: C<&>, C<|>, C
1471              
1472             Predicates compose with bitwise C<&> (and), C<|> (or), and C (not):
1473              
1474             filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and
1475             filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or
1476             filter($df, !(col('x') > 100)); # not
1477              
1478             Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability.
1479              
1480             =head3 The code-reference form
1481              
1482             For logic the operators can't express, pass a C. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row.
1483              
1484             filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' });
1485             filter($df, sub { $_->{name} =~ /^A/ });
1486             filter($df, sub { $_[0]{score} > $_[0]{threshold} });
1487              
1488             For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape.
1489              
1490             =head3 Examples
1491              
1492             use Stats::LikeR;
1493             my $df = read_table('patients.csv'); # array of hashes
1494             # numeric threshold
1495             my $adults = filter($df, col('Age') >= 18);
1496             # combine conditions
1497             my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f'));
1498             # arbitrary logic with a coderef
1499             my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 });
1500             # hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel
1501             my $hoa = read_table('patients.csv', 'output.type' => 'hoa');
1502             my $sub = filter($hoa, col('Age') > 32);
1503             # $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned
1504              
1505             =head3 Behavior and notes
1506              
1507             =over
1508              
1509             =item * B C builds and returns a new frame; C<$df> is left untouched.
1510              
1511             =item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C.
1512              
1513             =item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays.
1514              
1515             =item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA).
1516              
1517             =item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error.
1518              
1519             =item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS.
1520              
1521             =back
1522              
1523             =head3 See also
1524              
1525             C (whose C option applies the same coderef convention while reading a file), C.
1526              
1527             =head2 fisher_test
1528              
1529             =head3 array reference entry
1530              
1531             my $array_data = [
1532             [10, 2],
1533             [3, 15]
1534             ];
1535             my $res1 = fisher_test($array_data);
1536              
1537             which returns a hash reference:
1538              
1539             {
1540             alternative "two.sided",
1541             conf_int [
1542             [0] 2.75343836564204,
1543             [1] 300.682787419401
1544             ],
1545             conf_level 0.95,
1546             estimate {
1547             "odds ratio" 21.3053312750168
1548             },
1549             method "Fisher's Exact Test for Count Data",
1550             p_value 0.000536724119143435
1551             }
1552              
1553             =head3 hash reference entry
1554              
1555             $ft = fisher_test( {
1556             Guess => {
1557             Milk => 3, Tea => 1
1558             },
1559             Truth => {
1560             Milk => 1, Tea => 3
1561             }
1562             });
1563              
1564             =head2 glm
1565              
1566             takes a hash of an array as input
1567              
1568             my %tooth_growth = (
1569             dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
1570             1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
1571             0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
1572             2.0 2.0 2.0)],
1573             len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5
1574             17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
1575             15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0
1576             25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],
1577             supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
1578             VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
1579             OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]
1580             );
1581            
1582             my $glm_teeth = glm(
1583             data => \%tooth_growth,
1584             formula => 'len ~ dose + supp',
1585             family => 'gaussian'
1586             );
1587              
1588             In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS):
1589              
1590             my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');
1591              
1592             =head3 Input Parameters
1593              
1594              
1595              
1596             =begin html
1597              
1598            
1599            
1600            
1601             Parameter
1602             Type
1603             Default
1604             Description
1605             Example
1606            
1607            
1608            
1609            
1610             formula
1611             String
1612             None (Required)
1613             A symbolic description of the model to be fitted. Supports operators like +, :, *, ^, and -1 (to remove the intercept).
1614             'am ~ wt + hp', 'y ~ x - 1'
1615            
1616            
1617             data
1618             HashRef or ArrayRef
1619             None (Required)
1620             The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH).
1621             \%mtcars, [{x => 1, y => 2}, ...]
1622            
1623            
1624             family
1625             String
1626             'gaussian'
1627             A description of the error distribution and link function to be used in the model. Currently supports 'gaussian' (identity link) and 'binomial' (logit link).
1628             'binomial'
1629            
1630            
1631            
1632              
1633             =end html
1634              
1635              
1636              
1637             =head3 Output variables
1638              
1639              
1640              
1641             =begin html
1642              
1643            
1644            
1645            
1646             Variable
1647             Type
1648             Description
1649             Example
1650            
1651            
1652            
1653            
1654             aic
1655             Double
1656             Akaike's Information Criterion for the fitted model.
1657             123.45
1658            
1659            
1660             boundary
1661             Integer (Boolean)
1662             1 if the fitted values computationally reached the 0 or 1 boundary (specific to the binomial family), 0 otherwise.
1663             0
1664            
1665            
1666             coefficients
1667             HashRef
1668             A hash mapping the expanded model term names to their estimated coefficient values.
1669             {'Intercept' => 1.5, 'wt' => -0.5}
1670            
1671            
1672             converged
1673             Integer (Boolean)
1674             1 if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, 0 otherwise.
1675             1
1676            
1677            
1678             deviance
1679             Double
1680             The residual deviance of the fitted model.
1681             15.2
1682            
1683            
1684             deviance.resid
1685             HashRef
1686             A hash mapping data row names to their computed deviance residuals.
1687             {'Mazda RX4' => 0.12}
1688            
1689            
1690             df.null
1691             Integer
1692             The residual degrees of freedom for the null model.
1693             31
1694            
1695            
1696             df.residual
1697             Integer
1698             The residual degrees of freedom for the fitted model.
1699             30
1700            
1701            
1702             family
1703             String
1704             The statistical family used to fit the model.
1705             "gaussian"
1706            
1707            
1708             fitted.values
1709             HashRef
1710             A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response).
1711             {'Mazda RX4' => 0.85}
1712            
1713            
1714             iter
1715             Integer
1716             The number of IRLS iterations performed before convergence or hitting the iteration limit.
1717             4
1718            
1719            
1720             null.deviance
1721             Double
1722             The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed).
1723             43.5
1724            
1725            
1726             rank
1727             Integer
1728             The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters).
1729             2
1730            
1731            
1732             summary
1733             HashRef
1734             A nested hash mapping each term to its detailed summary statistics, including Estimate, Std. Error, t value / z value, and Pr(> t ) / Pr(> z ). Aliased parameters return "NaN".
1735             {'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}}
1736            
1737            
1738             terms
1739             ArrayRef
1740             An ordered list of the expanded term names included in the model matrix.
1741             ['Intercept', 'wt', 'hp']
1742            
1743            
1744            
1745              
1746             =end html
1747              
1748              
1749              
1750             =head2 group_by
1751              
1752             Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column.
1753              
1754             my $aoh_data = [
1755             { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },
1756             { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },
1757             { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },
1758             { 'Gender' => 'Female' } # Intentional missing target value
1759             ];
1760              
1761             as well as
1762              
1763             $hoh_data = {
1764             'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },
1765             'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },
1766             'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },
1767             'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value
1768             'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef
1769             };
1770              
1771             and
1772              
1773             my $hoa_data = {
1774             'Gender' => ['Male', 'Female', 'Male', 'Female'],
1775             'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ]
1776             };
1777              
1778             then run the function thus:
1779              
1780             group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender');
1781              
1782             The output can be thought of like a hash, with the first string broken down by the second.
1783              
1784             all become hash of arrays:
1785              
1786             {
1787             Female [
1788             [0] 1.8
1789             ],
1790             Male [
1791             [0] 18.2,
1792             [1] 20.5
1793             ]
1794             }
1795              
1796             returns an empty array of hashes if neither target nor group keys are found.
1797              
1798             =head3 Filtering
1799              
1800             Data can be further broken down with filter/subs like in C:
1801              
1802             my $testosterone = group_by($d, # group testosterone by "Gender"
1803             'Testosterone, total (nmol/L)',
1804             'Gender',
1805             { 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter
1806             { 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter
1807             );
1808              
1809             where each filter filters on the columns, e.g. second hash keys.
1810              
1811             =head2 hoh2hoa
1812              
1813             Convert a B (row-major: outer key = row, inner key = column)
1814             into a B (column-major: key = column, value = that column's
1815             cells down the rows).
1816              
1817             use Stats::LikeR;
1818            
1819             my %hoh = (
1820             'r1' => { 'a' => 1, 'b' => 2 },
1821             'r2' => { 'a' => 3, 'b' => 4 },
1822             );
1823            
1824             my $hoa = hoh2hoa(\%hoh);
1825              
1826             which returns
1827             {
1828             a => [1, 3],
1829             b => [2, 4],
1830             }
1831              
1832             =head3 Behavior
1833              
1834             =over
1835              
1836             =item * B are the union of every inner key, so a key that appears in only
1837             some rows still becomes a column.
1838              
1839             =item * B are emitted in sorted outer-key (row-name) order, and that one order
1840             is used for every column, so the arrays stay aligned and the result is
1841             reproducible regardless of hash ordering.
1842              
1843             =item * B — a missing inner key, or a cell whose value is C — are filled
1844             with the fill value (see C below). Every column therefore has
1845             exactly one entry per row.
1846              
1847             =item * Values are B into the result; the original structure is left
1848             untouched.
1849              
1850             =item * An B hash of hashes returns an empty hash of arrays (it is not an
1851             error).
1852              
1853             =back
1854              
1855             =head3 Options
1856              
1857             Options are passed as trailing C<< name =E value >> pairs.
1858              
1859              
1860              
1861             =begin html
1862              
1863            
1864            
1865            
1866             Option
1867             Default
1868             Meaning
1869            
1870            
1871            
1872            
1873             undef.val
1874             undef
1875             Value used to fill a missing key or an undef cell. Any defined scalar works, including 0 and ''. Passing undef keeps the default.
1876            
1877            
1878             row.names
1879             (none)
1880             If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column.
1881            
1882            
1883            
1884              
1885             =end html
1886              
1887              
1888              
1889             # Ragged input with an explicit fill string:
1890             my %ragged = (
1891             'r1' => { 'a' => 1, 'b' => 2 },
1892             'r2' => { 'a' => 3, 'c' => 9 },
1893             );
1894             my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA');
1895             # {
1896             # a => [1, 3 ],
1897             # b => [2, 'NA'],
1898             # c => ['NA', 9 ],
1899             # }
1900            
1901             # Keep the row labels as a column:
1902             my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id');
1903             # {
1904             # id => ['r1', 'r2'],
1905             # a => [1, 3 ],
1906             # b => [2, undef],
1907             # c => [undef, 9 ],
1908             # }
1909              
1910             =head3 Errors
1911              
1912             C dies (via C) when:
1913              
1914             =over
1915              
1916             =item * the argument is not a hash reference,
1917              
1918             =item * any value in the hash is not itself a hash reference,
1919              
1920             =item * an unknown option is given, or the options are not C<< name =E value >> pairs,
1921              
1922             =item * C is not a plain string, or it names an already-present column.
1923              
1924             =back
1925              
1926             =head2 hist
1927              
1928             Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.
1929              
1930             my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);
1931              
1932             If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.
1933              
1934             =head2 kruskal_test
1935              
1936             Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)
1937              
1938             Performs a Kruskal-Wallis rank sum test, see
1939             https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test
1940              
1941             =head3 hash of array entry
1942              
1943             I feel that this is better, and more easily read, than what you get in R:
1944              
1945             my %x = (
1946             'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],
1947             'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],
1948             'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]
1949             );
1950             $kt = kruskal_test(\%x);
1951              
1952             =head3 R-like array entry
1953              
1954             my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects
1955             my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease
1956             my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis
1957             my @x = (@xk, @yk, @zk);
1958             my @g = (
1959             (map {'Normal subjects'} 0..4),
1960             (map {'Subjects with obstructive airway disease'} 0..3),
1961             map {'Subjects with asbestosis'} 0..4
1962             );
1963             my $kt = kruskal_test(\@x, \@g);
1964              
1965             =head2 ks_test
1966              
1967             The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are part of the same distribution is implemented simply:
1968              
1969             $ks = ks_test(\@x, \@y, alternative => 'greater');
1970              
1971             returning a hash reference.
1972              
1973             Also, a single array can be tested against a normal distribution:
1974              
1975             $ks = ks_test($ksx, 'pnorm');
1976              
1977             The p-value precision is about 1e-8, which I want to improve, but am not sure how.
1978              
1979             =head2 ljoin
1980              
1981             Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>.
1982             C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>
1983              
1984             For example,
1985              
1986             {
1987             "Jack Smith" {
1988             age 30
1989             }
1990             }
1991              
1992             and a second hash,
1993             {
1994             "Jack Smith" {
1995             dept "Engineering"
1996             },
1997             "Jane Doe" {
1998             age 25
1999             }
2000             }
2001              
2002             in this case, running C will modify \%h to result:
2003              
2004             {
2005             "Jack Smith" {
2006             age 30,
2007             dept "Engineering"
2008             }
2009             }
2010              
2011             =head2 lm
2012              
2013             This is the linear models function.
2014              
2015             $lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);
2016              
2017             where C<$mtcars> is a hash of hashes
2018              
2019             C also supports generating interaction terms directly within the formula using the C<*> operator:
2020              
2021             my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);
2022              
2023             If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting.
2024              
2025             the dot operator also works:
2026              
2027             $lm = lm(formula => 'y ~ .', data => $dot_data);
2028              
2029             =head2 matrix
2030              
2031             my $mat1 = matrix(
2032             data => [1..6],
2033             nrow => 2
2034             );
2035              
2036             You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise.
2037              
2038             As of version 0.10, parameters do not need to be named, so that C works more like R:
2039              
2040             my $d = matrix(rnorm(32000), 1000, 32);
2041              
2042             works as C, C, and C
2043              
2044             =head2 max
2045              
2046             max(1,2,3);
2047              
2048             or
2049              
2050             my @arr = 1..8;
2051             max(@arr, 4, 5)
2052              
2053             as of version 0.02, max will die if any undefined values are provided
2054              
2055             =head2 mean
2056              
2057             mean(1,2,3);
2058              
2059             or
2060              
2061             my @arr = 1..8;
2062             mean(@arr, 4, 5)
2063              
2064             or
2065              
2066             mean([1,1], [2,2]) # 1.5
2067              
2068             as of version 0.02, mean will die if any undefined values are provided
2069              
2070             =head2 median
2071              
2072             works like mean, taking array references and arrays:
2073              
2074             median( $test_data[$i][0] )
2075              
2076             as of version 0.02, median will die if any undefined values are provided
2077              
2078             =head2 min
2079              
2080             min(1,2,3);
2081              
2082             or
2083              
2084             my @arr = 1..8;
2085             min(@arr, 4, 5)
2086              
2087             as of version 0.02, min will die if any undefined values are provided
2088              
2089             =head2 mode
2090              
2091             Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings)
2092              
2093             @arr = mode([1,3,3,3]); # returns (3)
2094            
2095             @arr = mode('a','a','c','c','z'); # returns ('a', 'c')
2096              
2097             =head2 oneway_test
2098              
2099             Like ANOVA/aov but does not assume normality
2100              
2101             =head3 hash of array input
2102              
2103             $test_data = oneway_test({
2104             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
2105             ctrl => [1, 1, 1, 0, 0, 0]
2106             });
2107              
2108             which will output a hash reference:
2109              
2110             {
2111             Group {
2112             Df 1,
2113             "F value" 177.504798464491,
2114             "Mean Sq" 61.6533333333333,
2115             Pr(>F) 1.31343255160843e-07,
2116             "Sum Sq" 61.6533333333333
2117             },
2118             group_stats {
2119             mean {
2120             ctrl 0.5,
2121             yield 5.03333333333333
2122             },
2123             size {
2124             ctrl 6,
2125             yield 6
2126             }
2127             },
2128             Residuals {
2129             Df 9.81767348326473,
2130             "Mean Sq" 0.353783749200256,
2131             "Sum Sq" 3.47333333333333
2132             }
2133              
2134             }
2135              
2136             =head3 array of array input
2137              
2138             oneway_test([
2139             [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
2140             [1, 1, 1, 0, 0, 0]
2141             ]);
2142              
2143             which will output a nearly identical hash reference as for hash of arrays:
2144              
2145             {
2146             Group {
2147             Df 1,
2148             "F value" 177.504798464491,
2149             "Mean Sq" 61.6533333333333,
2150             Pr(>F) 1.31343255160843e-07,
2151             "Sum Sq" 61.6533333333333
2152             },
2153             group_stats {
2154             mean {
2155             "Index 0" 5.03333333333333,
2156             "Index 1" 0.5
2157             },
2158             size {
2159             "Index 0" 6,
2160             "Index 1" 6
2161             }
2162             },
2163             Residuals {
2164             Df 9.81767348326473,
2165             "Mean Sq" 0.353783749200256,
2166             "Sum Sq" 3.47333333333333
2167             }
2168             }
2169              
2170             =head2 p_adjust
2171              
2172             Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"
2173              
2174             my @q = p_adjust(\@pvalues, $method);
2175              
2176             =head2 power_t_test
2177              
2178             $test_data = power_t_test(
2179             n => 30, delta => 0.5,
2180             sd => 1.0, sig_level => 0.05
2181             );
2182              
2183             It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution.
2184              
2185              
2186              
2187             =begin html
2188              
2189            
2190            
2191            
2192             Parameter
2193             Type
2194             Default
2195             Description
2196            
2197            
2198            
2199            
2200             n
2201             Float
2202             undef
2203             Number of observations (per group for two-sample, pairs for paired).
2204            
2205            
2206             delta
2207             Float
2208             undef
2209             True difference in means.
2210            
2211            
2212             sd
2213             Float
2214             1.0
2215             Standard deviation.
2216            
2217            
2218             sig_level
2219             Float
2220             0.05
2221             Significance level (Type I error probability). Also accepts sig.level.
2222            
2223            
2224             power
2225             Float
2226             undef
2227             Power of test (1 minus Type II error probability).
2228            
2229            
2230             type
2231             String
2232             "two.sample"
2233             Type of t-test: "two.sample", "one.sample", or "paired".
2234            
2235            
2236             alternative
2237             String
2238             "two.sided"
2239             One- or two-sided test: "two.sided", "one.sided", "greater", or "less".
2240            
2241            
2242             strict
2243             Boolean
2244             0 (False)
2245             Use strict interpretation of two-sided power calculations.
2246            
2247            
2248             tol
2249             Float
2250             ~1.22e-4
2251             Numerical tolerance used for the internal root-finding algorithm.
2252            
2253            
2254            
2255              
2256             =end html
2257              
2258              
2259              
2260             =head2 prcomp
2261              
2262             Principal Component Analysis
2263              
2264             =head3 Options
2265              
2266              
2267              
2268             =begin html
2269              
2270            
2271            
2272            
2273             Option
2274             Type
2275             Default
2276             Description
2277            
2278            
2279            
2280            
2281             center
2282             Boolean
2283             1 (True)
2284             If true, the variables are shifted to be zero-centered before the analysis takes place.
2285            
2286            
2287             scale
2288             Boolean
2289             0 (False)
2290             If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will croak to prevent division by zero.
2291            
2292            
2293             retx
2294             Boolean
2295             1 (True)
2296             If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key x.
2297            
2298            
2299             tol
2300             Number
2301             undef
2302             A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to tol times the standard deviation of the first component.
2303            
2304            
2305             rank
2306             Integer
2307             undef
2308             Optionally specify a strict limit on the number of principal components to return. The function will return min(rank, rows, columns) components.
2309            
2310            
2311            
2312              
2313             =end html
2314              
2315              
2316              
2317             =head3 Results
2318              
2319             =head4 Returned Data Structure
2320              
2321             The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis:
2322              
2323              
2324              
2325             =begin html
2326              
2327            
2328            
2329            
2330             Key
2331             Type
2332             Description
2333            
2334            
2335            
2336            
2337             sdev
2338             ArrayRef[Number]
2339             The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix.
2340            
2341            
2342             rotation
2343             ArrayRef[ArrayRef]
2344             A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components.
2345            
2346            
2347             x
2348             ArrayRef[ArrayRef]
2349             A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the retx option is true.
2350            
2351            
2352             center
2353             ArrayRef[Number] or 0
2354             The centering values used (typically the column means). Returns false (0) if centering was disabled.
2355            
2356            
2357             scale
2358             ArrayRef[Number] or 0
2359             The scaling values used (typically the column standard deviations). Returns false (0) if scaling was disabled.
2360            
2361            
2362             varnames
2363             ArrayRef[String]
2364             The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH).
2365            
2366            
2367            
2368              
2369             =end html
2370              
2371              
2372              
2373             =head3 Using array of arrays
2374              
2375             my $aoa = [
2376             [2, 4],
2377             [4, 2],
2378             [6, 6]
2379             ];
2380            
2381             my $pca = prcomp($aoa);
2382              
2383             which returns
2384              
2385             {
2386             center [
2387             [0] 4,
2388             [1] 4
2389             ],
2390             rotation [
2391             [0] [
2392             [0] 0.707106781186547,
2393             [1] 0.707106781186548
2394             ],
2395             [1] [
2396             [0] 0.707106781186548,
2397             [1] -0.707106781186547
2398             ]
2399             ],
2400             scale 0,
2401             sdev [
2402             [0] 2.44948974278318,
2403             [1] 1.4142135623731
2404             ],
2405             x [
2406             [0] [
2407             [0] -1.41421356237309,
2408             [1] -1.4142135623731
2409             ],
2410             [1] [
2411             [0] -1.4142135623731,
2412             [1] 1.41421356237309
2413             ],
2414             [2] [
2415             [0] 2.82842712474619,
2416             [1] 2.22044604925031e-16
2417             ]
2418             ]
2419             }
2420              
2421             =head3 Hash of Arrays
2422              
2423             my $hoa = { B => [4, 2, 6], A => [2, 4, 6] };
2424             my $pca = prcomp($hoa);
2425              
2426             =head2 quantile
2427              
2428             Calculates sample quantiles using R's continuous Type 7 interpolation.
2429              
2430             my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);
2431              
2432             If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).
2433              
2434             =head2 rbinom
2435              
2436             Create a binomial distribution of numbers
2437              
2438             my $binom = rbinom( n => $n, prob => 0.5, size => 9);
2439              
2440             It hooks directly into Perl's internal PRNG system, respecting C seeds.
2441              
2442             =head2 read_table
2443              
2444             I've tried to make this as simple as possible, trying to follow from R:
2445              
2446             my $test_data = read_table('t/HepatitisCdata.csv');
2447              
2448             =head3 options
2449              
2450              
2451              
2452             =begin html
2453              
2454            
2455            
2456            
2457             Option
2458             Description
2459             Example
2460            
2461            
2462            
2463            
2464             comment
2465             Comment character, by default #
2466             comment => % or whatever; does not apply with header
2467            
2468            
2469             output.type
2470             data type for output: array of hash, hash of array, or hash of hash
2471             'output.type' => 'aoh'
2472            
2473            
2474             filter
2475             Only take in rows with a certain filter
2476             filter => { Sex => sub {$_ eq 'f'} }
2477            
2478            
2479             row.names
2480             include row names in retrieved data; off by default
2481            
2482            
2483            
2484             sep
2485             field separator character; synonym with delim
2486             sep => "\t"
2487            
2488            
2489             delim
2490             field separator character; synonym with sep
2491             delim => "\t"
2492            
2493            
2494            
2495              
2496             =end html
2497              
2498              
2499              
2500             output types can be AOH (aoa), HOA (hoa), HOH (hoh)
2501              
2502             read_table($filename, 'output.type' => 'aoh');
2503            
2504             read_table($filename, 'output.type' => 'hoa');
2505              
2506             and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:
2507              
2508             $test_data = read_table(
2509             't/HepatitisCdata.csv',
2510             filter => {
2511             Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column
2512             },
2513             'output.type' => 'aoh'
2514             );
2515              
2516             the default delimiter is C<,>
2517             Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority.
2518              
2519             =head2 rnorm
2520              
2521             Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C.
2522              
2523             my ($rmean, $sd, $n) = (10, 2, 9999);
2524             my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);
2525              
2526             =head2 runif
2527              
2528             Make an approximately uniform distribution into an array
2529              
2530             =head3 named arguments
2531              
2532             my $unif = runif( n => $n, min => 0, max => 1);
2533              
2534             where C is the number of items, the values are between C and C
2535              
2536             =head3 positional args
2537              
2538             this is to match R's behavior:
2539              
2540             runif( 9 )
2541              
2542             will make 9 numbers in [0,1]
2543              
2544             runif(9, 0, 99)
2545              
2546             will match C, C, and C respectively
2547              
2548             =head2 sample
2549              
2550             take a sample of hash or array slices.
2551              
2552             my $h = sample(\%h, 4); # take 4 hash keys and their values into $h
2553              
2554             or, alternatively, with arrays:
2555              
2556             my $arr = sample(\@arr, 3); # take 3 indices of an array
2557              
2558             =head2 scale
2559              
2560             my @scaled_results = scale(1..5);
2561              
2562             You can also pass an options hash to disable centering or scaling:
2563              
2564             my @scaled_results = scale(1..5, { center => false, scale => 1 });
2565              
2566             It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently:
2567              
2568             my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);
2569              
2570             =head2 sd
2571              
2572             my $stdev = sd(2,4,4,4,5,5,7,9);
2573              
2574             Correct answer is 2.1380899352994
2575              
2576             C can accept both array references as well as arrays:
2577              
2578             my $stdev = sd([2,4,4,4,5,5,7,9]);
2579              
2580             As of version 0.02, sd will croak/die if any undefined values are provided.
2581              
2582             =head2 seq
2583              
2584             Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference.
2585              
2586             =head3 Standard integer sequence
2587              
2588             say 'seq(1, 5):';
2589             my @seq = seq(1, 5);
2590             say join(', ', @seq), "\n";
2591            
2592             say 'seq(1, 2, 0.25):';
2593             @seq = seq(1, 2, 0.25);
2594              
2595             =head3 Fractional steps
2596              
2597             say 'seq(1, 2, 0.25):';
2598             @seq = seq(1, 2, 0.25);
2599             say join(", ", @seq), "\n";
2600             for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop
2601             is_approx(pop @seq, $idx, "seq item $idx with fractional step");
2602             }
2603              
2604             =head3 Negative steps
2605              
2606             say 'seq(10, 5, -1):';
2607             @seq = seq(10, 5, -1);
2608             say join(", ", @seq), "\n";
2609             for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop
2610             is_approx(pop @seq, $idx, "seq item $idx with negative step");
2611             }
2612              
2613             =head2 shapiro_test
2614              
2615             tests to see if an array reference is normally distributed, returns a p-value and a statistic
2616              
2617             my $shapiro = shapiro_test(
2618             [1..5]
2619             );
2620              
2621             and returns the hash reference:
2622              
2623             {
2624             p.value 0.589650577093106,
2625             p_value 0.589650577093106,
2626             statistic 0.960870680168535,
2627             W 0.960870680168535
2628             }
2629              
2630             =head2 sum
2631              
2632             returns sum, but using both arrays and array references.
2633              
2634             my $test_data = [1..8];
2635             sum($test_data)
2636              
2637             which I prefer, compared to List::Util's required casting into an array:
2638              
2639             sum(@{ $test_data });
2640              
2641             which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B
2642              
2643             as of version 0.02, C will cause the script to die if any undefined values are provided
2644              
2645             =head2 summary
2646              
2647             Analogous to R's C, but does not deal with outputs from other functions.
2648             C only describes data as it is entered.
2649             An option C or its synonym C specifies the maximum number of rows that will print.
2650              
2651             =head3 array of array input
2652              
2653             my @arr;
2654             foreach my $i (0..18) {
2655             push @arr, runif(22);
2656             }
2657              
2658             and then C, or C
2659              
2660             ---------------------------------------------------------------------------
2661             Index # values Min. 1st Qu. Median Mean 3rd Qu. Max.
2662             ---------------------------------------------------------------------------
2663             0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633
2664             1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371
2665             2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878
2666             3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808
2667             4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043
2668             5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556
2669              
2670             =head3 hash of array input
2671              
2672             $test_data = summary(
2673             {
2674             A => runif(9),
2675             B => runif(9)
2676             },
2677             );
2678              
2679             =head2 t_test
2680              
2681             There are 1-sample and 2-sample t-tests, from one or two arrays:
2682              
2683             my $t_test = t_test( $array1, mu => 0.2334 );
2684              
2685             or 2-sample:
2686              
2687             $t_test = t_test(
2688             $array1, $array2,
2689             paired => 1
2690             );
2691              
2692             returns a hash reference, which looks like:
2693              
2694             conf_int => [
2695             -0.06672889, 0.25672889
2696             ],
2697             df => 5,
2698             estimate => 0.095,
2699             p_value => 0.19143688433660,
2700             statistic => 1.50996688705414
2701              
2702             the two groups compared can be specified, though not necessarily, as C and C, just like in R:
2703              
2704             $t_test = t_test(
2705             'x' => $array1, 'y' => $array2,
2706             paired => 1
2707             );
2708              
2709             =head3 Parameters
2710              
2711              
2712              
2713             =begin html
2714              
2715            
2716            
2717            
2718             Parameter
2719             Type
2720             Default
2721             Description
2722            
2723            
2724            
2725            
2726             x
2727             Array Reference
2728             Required
2729             The first vector of data. Must contain at least 2 elements.
2730            
2731            
2732             y
2733             Array Reference
2734             undef
2735             The second vector of data. Required for two-sample or paired tests.
2736            
2737            
2738             mu
2739             Float
2740             0.0
2741             The true value of the mean (or difference in means) for the null hypothesis.
2742            
2743            
2744             paired
2745             Boolean
2746             FALSE
2747             If true, performs a paired t-test. x and y must be the same length.
2748            
2749            
2750             var_equal
2751             Boolean
2752             FALSE
2753             If true, assumes equal variances (standard two-sample). If false, performs Welch's t-test with unequal variances.
2754            
2755            
2756             conf_level
2757             Float
2758             0.95
2759             Confidence level for the returned confidence interval. Must be between 0 and 1.
2760            
2761            
2762             alternative
2763             String
2764             "two.sided"
2765             Direction of the alternative hypothesis: "two.sided", "less", or "greater".
2766            
2767            
2768            
2769              
2770             =end html
2771              
2772              
2773              
2774             =head3 Return Hash
2775              
2776              
2777              
2778             =begin html
2779              
2780            
2781            
2782            
2783             Key
2784             Description
2785            
2786            
2787            
2788            
2789             statistic
2790             The computed t-statistic.
2791            
2792            
2793             df
2794             Degrees of freedom for the test.
2795            
2796            
2797             p_value
2798             The calculated p-value based on the test directionality.
2799            
2800            
2801             conf_int
2802             An Array Reference containing two elements: [lower_bound, upper_bound].
2803            
2804            
2805             estimate
2806             The estimated mean of x (one-sample) OR the mean of the differences (paired).
2807            
2808            
2809             estimate_x
2810             The estimated mean of the x vector (only returned in two-sample tests).
2811            
2812            
2813             estimate_y
2814             The estimated mean of the y vector (only returned in two-sample tests).
2815            
2816            
2817            
2818              
2819             =end html
2820              
2821              
2822              
2823             =head2 transpose
2824              
2825             Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes.
2826             Returns a new reference of the same type; the input is never modified.
2827              
2828             =head3 Array of array input
2829              
2830             Takes a reference to an array of array references and returns a new AoA where C.
2831              
2832             my $matrix = [[1, 2, 3], [4, 5, 6]];
2833             my $t = transpose($matrix);
2834             # [[1, 4],
2835             # [2, 5],
2836             # [3, 6]]
2837              
2838             All rows must be the same length; a ragged input is a fatal error.
2839             C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>.
2840              
2841             Dies if:
2842             - any inner element is not an array reference
2843             - rows differ in length (ragged array)
2844              
2845             =head3 Hash of hash input
2846              
2847             Takes a reference to a hash of hash references and returns a new HoH where C.
2848              
2849             my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } };
2850             my $t = transpose($table);
2851             # { score => { alice => 97, bob => 84 },
2852             # grade => { alice => 'A', bob => 'B' } }
2853              
2854             Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed.
2855              
2856             my $sparse = {
2857             a => { x => 1, y => 2 },
2858             b => { x => 3, z => 4 } };
2859            
2860             my $t = transpose($sparse);
2861             # { x => { a => 1, b => 3 },
2862             # y => { a => 2 },
2863             # z => { b => 4 } }
2864              
2865             An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>.
2866              
2867             Dies if any inner element is not a hash reference
2868              
2869             =head2 value_counts
2870              
2871             Count the values in a given data set, return a hash reference showing how many times each particular value is present.
2872              
2873             =head3 Scalar
2874              
2875             $hash = value_counts('c');
2876              
2877             returns C<< { c =E 1 } >>
2878              
2879             =head3 Array reference
2880              
2881             value_counts(['a','b','b']);
2882              
2883             returns C<< { a =E 1, b =E 2} >>
2884              
2885             =head3 Array
2886              
2887             my $value_counts = value_counts('a','b','b');
2888              
2889             like an array reference above, returns C<< { a =E 1, b =E 2} >>
2890              
2891             =head3 Hash
2892              
2893             my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } );
2894              
2895             returns C<< { a =E 2, b =E 1} >>
2896              
2897             =head3 Hash of array
2898              
2899             my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']});
2900              
2901             without a key (like above), the occurences of C, C, and C are counted.
2902              
2903             With a key, like C for above, only values within that hash key are counted:
2904              
2905             my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a');
2906              
2907             =head3 Hash of hash (table)
2908              
2909             $hash = value_counts( {
2910             A => {
2911             a => 'x',
2912             b => 'z'
2913             },
2914             B => {
2915             a => 'x'
2916             },
2917             C => {
2918             a => 'y'
2919             }
2920             }, 'a');
2921              
2922             the column, or second hash key, that you wish to count, is specified at the command line
2923              
2924             =head2 var
2925              
2926             as simple as possible:
2927              
2928             var(2, 4, 5, 8, 9)
2929              
2930             as of version 0.02, C will die if any undefined values are provided
2931              
2932             like C, C, etc., C can accept array references, to make code simpler:
2933              
2934             my $ref = \@arr;
2935             var($ref) = var(@arr)
2936              
2937             =head2 var_test
2938              
2939             As described by R: Performs an F test to compare the variances of two samples from normal populations
2940              
2941             use Stats::LikeR;
2942            
2943             my @x = (2.9, 3.0, 2.5, 2.6, 3.2);
2944             my @y = (3.8, 2.7, 4.0, 2.4);
2945            
2946             my $vt = var_test(\@x, \@y);
2947              
2948             also, conf_level can be set:
2949              
2950             $vt = var_test(\@x, \@y, conf_level => 0.99);
2951              
2952             as well as a ratio (from R: the hypothesized ratio of the population variances of C and C:
2953              
2954             $test_data = var_test(\@xk, \@yk, ratio => 2);
2955              
2956             =head2 view
2957              
2958             An R-style C for the structures C returns. Prints the first
2959             few rows of a dataframe as an aligned text table, with numeric columns
2960             right-justified, string columns left-justified, and undefined cells shown as
2961             C. Works on all three C values:
2962              
2963              
2964              
2965             =begin html
2966              
2967            
2968            
2969            
2970             `output.type`
2971             Perl structure
2972             What `view` shows
2973            
2974            
2975            
2976            
2977             aoh
2978             array of hash refs
2979             one line per row, sequential row numbers
2980            
2981            
2982             hoa
2983             hash of array refs
2984             values gathered column-wise by row index
2985            
2986            
2987             hoh
2988             hash of hash refs
2989             top-level keys become the row label column
2990            
2991            
2992            
2993              
2994             =end html
2995              
2996              
2997              
2998             =head3 Synopsis
2999              
3000             my $aoh = read_table('all.data.tsv', 'output.type' => 'aoh');
3001            
3002             view($aoh); # first 6 rows, like head()
3003             view($aoh, n => 20); # first 20 rows
3004             view($aoh, cols => [qw(id age tt)]); # force a column order
3005             view($aoh, 'row.names' => 'id'); # use column 'id' as the row label
3006             view($aoh, na => '.', max_width => 30);
3007            
3008             my $txt = view($aoh, return_only => 1); # capture the string, print nothing
3009             view($aoh, to => \*STDERR); # print somewhere other than STDOUT
3010              
3011             =head3 Output
3012              
3013             # AoH: 7 rows x 3 cols (showing 6)
3014             row_name Testosterone, total (nmol/L) age sex
3015             p1 18.2 41 M
3016             p2 NA 7 F
3017             p3 1.05 33 F
3018             p4 22.9 55 M
3019             p5 14 29 M
3020             p6 NA 62 F
3021             # ... 1 more row
3022              
3023             The banner reports the structure type, full dimensions, and how many rows are
3024             displayed. A footer appears only when rows are hidden.
3025              
3026             =head3 Arguments
3027              
3028             All arguments after the data reference are optional name/value pairs.
3029              
3030              
3031              
3032             =begin html
3033              
3034            
3035            
3036            
3037             Argument
3038             Default
3039             Meaning
3040            
3041            
3042            
3043            
3044             n
3045             6
3046             Number of rows to show. n greater than the table shows everything.
3047            
3048            
3049             cols / columns
3050            
3051             Array ref pinning column order (and which columns appear).
3052            
3053            
3054             row.names
3055            
3056             Column to use as the row label (for aoh/hoa). See ordering note.
3057            
3058            
3059             na
3060             'NA'
3061             Token printed for undefined cells.
3062            
3063            
3064             max_width
3065             50
3066             Truncate any cell wider than this (column names are never truncated).
3067            
3068            
3069             ellipsis
3070             '...'
3071             Marker appended to truncated cells.
3072            
3073            
3074             gap
3075             2
3076             Spaces between columns.
3077            
3078            
3079             to
3080             STDOUT
3081             Filehandle to print to.
3082            
3083            
3084             return_only
3085             0
3086             If true, return the string and print nothing.
3087            
3088            
3089            
3090              
3091             =end html
3092              
3093              
3094              
3095             C always returns the formatted string, whether or not it also prints.
3096              
3097             =head3 A note on column order
3098              
3099             C stores rows as hashes, so the original CSV column order is not
3100             preserved. C therefore sorts columns by name for a stable, reproducible
3101             layout. Two conveniences soften this:
3102              
3103             =over
3104              
3105             =item * A column literally named C (the label C assigns to a
3106             leading blank header) is detected automatically and moved to the left as the
3107             row label.
3108              
3109             =item * Pass C<< cols =E [ ... ] >> to control both the order and the selection of columns
3110             shown.
3111              
3112             =back
3113              
3114             When no label column is present, C numbers the rows C<1, 2, 3, …>, the way
3115             R prints row names for an unnamed data frame.
3116              
3117             =head3 Edge cases
3118              
3119             =over
3120              
3121             =item * Empty input (C<[]> or C<{}>) prints a clean C<0 rows x 0 cols> banner.
3122              
3123             =item * Tabs, carriage returns, and newlines inside a cell are escaped (C<\t>, C<\r>,
3124             C<\n>) so one record always stays on one line.
3125              
3126             =item * A non-reference argument, or a hash whose values are plain scalars, dies with
3127             a clear message rather than producing garbled output.
3128              
3129             =back
3130              
3131             =head3 Tests
3132              
3133             The behavior above is covered by C (run with C): the three
3134             structure types, C boundaries, alignment, C rendering, truncation,
3135             C/C handling, control-character escaping, the C and
3136             C output paths, empty structures, and the error cases.
3137              
3138             =head2 wilcox_test
3139              
3140             $test_data = wilcox_test(
3141             [1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],
3142             [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]
3143             );
3144              
3145             It fully supports paired tests (C<< paired =E 1 >>) and can calculate exact p-values (the default for C<< N E 50 >> without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.
3146              
3147             =head2 write_table
3148              
3149             mimics R's C, with data as first argument to subroutine, and output file as second
3150              
3151             write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1);
3152              
3153             You can also precisely filter and reorder which columns are written by passing an array reference to C:
3154              
3155             write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);
3156              
3157             undefined variables are printed as C by default, but can be set as you wish using C
3158              
3159             write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan')
3160              
3161             as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set.
3162              
3163             Args can also be accepted:
3164              
3165             write_table( 'data' => \%flat, 'file' => $f );
3166              
3167             =head1 changes
3168              
3169             =head2 0.15
3170              
3171             C function added, similar to R's C
3172              
3173             C:
3174             filter => {
3175             'Testosterone, total (nmol/L)' => sub { defined $_ },
3176             }
3177              
3178             was broken by the change in undefined variables in 0.14, but is back to being C
3179              
3180             C improvement in sectioning in README
3181              
3182             Numerous changes to prevent quadmath/long double CPAN test failures
3183              
3184             Minimum Scalar::Util version in dist.ini is now 1.22, see https://www.cpantesters.org/cpan/report/6b682236-6567-11f1-a3bc-a055f9c4ba34
3185              
3186             C is no longer needed, and removed as a dependency
3187              
3188             =head3 C
3189              
3190             =head4 Behavior change
3191              
3192             =over
3193              
3194             =item * B<< C cells now write as an empty field, not an empty string. >> A missing
3195             or C value renders as nothing between separators (C) rather than a
3196             quoted empty string (C / C). Supplying C<< 'undef.val' =E 'NA' >>
3197             (or any other token) still overrides this, exactly as before. This is the
3198             only change that can alter the bytes of an existing output file; if you relied
3199             on the previous default, pass C<< 'undef.val' =E '' >> to keep an explicit empty
3200             field, or your chosen placeholder.
3201              
3202             =back
3203              
3204             =head4 Bug fixes
3205              
3206             =over
3207              
3208             =item * B
3209             Previously, cells were looked up with the raw bytes of the column name
3210             (C), which fails to match a
3211             UTF-8-flagged hash key: the column header printed correctly but every cell
3212             under it came back empty. All lookups now fetch by SV (C), header
3213             lists are gathered and sorted as SVs (C + C, preserving the
3214             flag) instead of being round-tripped through C, and the C
3215             column is matched with C rather than C. Embedded NUL bytes in
3216             keys are handled correctly as a side effect.
3217              
3218             =item * B<< C<< col.names =E [] >> no longer loops forever. >> An empty C array made
3219             C return C<-1>, which — compared against an unsigned C loop
3220             index — wrapped to C and ran effectively without end. This was fixed
3221             for flat hashes previously; it was still present for hash-of-hashes,
3222             hash-of-arrays, and array-of-hashes, plus both C header-filtering
3223             loops. All such loops now use a signed index.
3224              
3225             =item * B One header loop used an
3226             C index that silently wrapped past 65,535 and never terminated.
3227             It now uses C like the rest of the code.
3228              
3229             =item * B Every other input shape
3230             rejects a nested reference with
3231             I<"Cannot write nested reference types to table">; a flat hash instead
3232             stringified it (e.g. C) into the file. It now croaks
3233             consistently.
3234              
3235             =item * B<< C<< 'undef.val' =E undef >> is handled cleanly. >> It previously called
3236             C on C, raising an I<"uninitialized value"> warning and
3237             yielding an empty string by accident. It is now treated explicitly as an empty
3238             field, with no warning.
3239              
3240             =back
3241              
3242             =head4 Memory-leak fixes (exception paths)
3243              
3244             =over
3245              
3246             =item * The row-key list gathered for hash-of-hashes input was leaked when the output
3247             file could not be opened.
3248              
3249             =item * The I<"Could not get headers"> croak on hash-of-arrays input leaked both the
3250             already-open filehandle and the headers array.
3251              
3252             =back
3253              
3254             =head4 Internal / non-behavioral
3255              
3256             =over
3257              
3258             =item * Numeric row labels are now formatted into a reused stack buffer instead of a
3259             per-row C / C allocation (no functional change; removes a
3260             cast-away-C and one allocation per row).
3261              
3262             =item * Several signed/unsigned index types were made consistent (C vs
3263             C) to match C and silence the conditions behind the loop bugs
3264             above.
3265              
3266             =back
3267              
3268             =head4 Tests
3269              
3270             =over
3271              
3272             =item * C expanded from 17 to 69 assertions. New coverage targets each
3273             fix above: the empty-field default and C<< undef.val =E undef >> (no warning),
3274             C<< col.names =E [] >> termination across all four input shapes, the
3275             >65,535-column header loop (gated behind C), in-sequence
3276             numeric row labels, nested-reference rejection, CSV quoting corners
3277             (carriage return, separators inside column names, multi-character separators),
3278             empty input writing no file, and UTF-8 column names and row keys. Two leak
3279             assertions cover the exception paths above.
3280              
3281             =back
3282              
3283             =head2 0.14
3284              
3285             C function added for rows
3286              
3287             C reads undefined values to C instead of C, which makes calculations easier
3288              
3289             C writes undef by default as an empty string C<''>
3290              
3291             C transforms a hash of hashes into an hash of arrays
3292              
3293             C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b
3294              
3295             Numerous switches from C to C for local precision, like above
3296              
3297             numerous changes to C for ease of use and working with datasets with numerous undefined values
3298              
3299             dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea
3300              
3301             C now should be complete, errors with confidence intervals fixed
3302              
3303             =head2 0.13
3304              
3305             C: speed improvements; commented headers are now allowed
3306              
3307             C: fix for
3308              
3309             Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182.
3310             main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203
3311             Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183.
3312             main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203
3313              
3314             C gives better warnings for incorrect types of data given
3315              
3316             Numerous changes to dist.ini to improve CPAN testing, especially for Win32
3317              
3318             =head2 0.12
3319              
3320             C can also take hash of arrays, and various mixes of data types
3321              
3322             C: Addition of C keywords in many places; should improve CPU performance
3323              
3324             Better POD formatting, correction of output hash for README's C
3325              
3326             C can now accept hash of hashes as input
3327              
3328             new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns
3329              
3330             removed unused function from C helpers
3331              
3332             C: addition of restrict keywords in preinit, should improve CPU performance
3333              
3334             MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing
3335              
3336             using C for tests of C, which may or may not work with CPAN testers (experimental)
3337              
3338             Added function name to warnings, so I actually know which function is producing the error
3339              
3340             C can also take C and C as args, in addition to positions
3341              
3342             fixed C as it could hang if given empty C or C
3343              
3344             Added C<__EXTENSIONS__> to source XS file for better CPAN testing
3345              
3346             =head2 0.11
3347              
3348             better POD formatting for tables
3349              
3350             addition of MANIFEST.skip to get better testing results on CPAN
3351              
3352             C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t
3353              
3354             C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes
3355              
3356             Better documentation for t-test
3357              
3358             =head2 0.10
3359              
3360             changes to compilation for CPAN, trying to get this work on Windows
3361              
3362             Addition of C and C
3363              
3364             C will work without key names, just like in R. Testing for C has improved.
3365              
3366             =head2 0.09
3367              
3368             context changes in XS C, C, and C to get better CPAN testing results
3369              
3370             C keywords added to C to increase speed
3371              
3372             =head2 0.08
3373              
3374             Speed improvement in C of hashes.
3375              
3376             Addition of C, C, C, C, and C functions
3377              
3378             Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call.
3379              
3380             Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better.
3381              
3382             C now returns hash-of-hash in {row}{column}
3383              
3384             =head2 0.07
3385              
3386             Addition of C function.
3387              
3388             Formulas can now be omitted from C, resulting in a stacked calculation as R would think.
3389              
3390             Addition of C for multi-group comparisons that does not assume normality like C does.
3391              
3392             C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name.
3393              
3394             =head2 0.06
3395              
3396             Changed compiler options so that Solaris will work
3397              
3398             signed integers changed to unsigned in C
3399              
3400             Added restrict keywords to C, and made C to C
3401              
3402             =head2 0.05
3403              
3404             Leak testing for C
3405              
3406             removal of Data::Printer dependency for easier CPAN testing
3407              
3408             switched several C variable to C so that clang doesn't complain
3409              
3410             added restrict keyword for C
3411              
3412             =head2 0.04
3413              
3414             addition of C function
3415              
3416             GNU source, to maximize compatibility and ease installation
3417              
3418             removal of JSON dependency to ease installation
3419              
3420             =head2 0.03
3421              
3422             Compatibility back to Perl 5.10
3423              
3424             =head2 0.02
3425              
3426             back-compatible to Perl 5.10, instead of original 5.40, ensuring more people can use it
3427              
3428             added var_test
3429              
3430             mean, min, sum, median, var, and max die with undefined values, and print the offending indices
3431              
3432             "group_stats" added to aov, for TukeyHSD in the future
3433              
3434             "cor" dies when given data with standard deviation of 0
3435              
3436             C now has C option, which shows how undefined values are printed to tables, which is C by default.