File Coverage

blib/lib/Stats/LikeR.pm
Criterion Covered Total %
statement 200 221 90.5
branch 70 90 77.7
condition 34 56 60.7
subroutine 29 32 90.6
pod 3 3 100.0
total 336 402 83.5


are exported by default.
line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             # ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance
3             require 5.010;
4 13     13   458814 use strict;
  13         17  
  13         438  
5 13     13   80 use feature 'say';
  13         15  
  13         3171  
6             package Stats::LikeR;
7             our $VERSION = 0.14;
8             require XSLoader;
9 13     13   6411 use Devel::Confess 'color';
  13         111176  
  13         38  
10 13     13   1199 use warnings FATAL => 'all';
  13         33  
  13         592  
11 13     13   5129 use autodie ':default';
  13         171192  
  13         42  
12 13     13   57231 use Exporter 'import';
  13         17  
  13         459  
13 13     13   48 use Scalar::Util 'looks_like_number';
  13         17  
  13         7171  
14             XSLoader::load('Stats::LikeR', $VERSION);
15             our @EXPORT_OK = qw(add_data aov cfilter chisq_test col col2col cor cor_test cov dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test wilcox_test write_table);
16             our @EXPORT = @EXPORT_OK;
17              
18             require XSLoader;
19             # ---- filter DSL: col() builds a predicate via overloading (pure Perl) -------
20             # Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm;
21             # they need no XS. filter() is the XSUB.
22 26     26 1 135822 sub col { Stats::LikeR::Col->new($_[0]) }
23             {
24             package Stats::LikeR::Col;
25 26   33 26   173 sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] }
26             # build a comparison leaf; if operands were swapped (4 > col('x')), flip the op
27             sub _c {
28 26     26   47 my ($self, $val, $swapped, $op, $flip) = @_;
29 26 100       62 Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val);
30             }
31             use overload
32 13     13   24 '>' => sub { $_[0]->_c($_[1],$_[2],'>','<') },
33 3     3   12 '<' => sub { $_[0]->_c($_[1],$_[2],'<','>') },
34 1     1   4 '>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') },
35 1     1   8 '<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') },
36 3     3   6 '==' => sub { $_[0]->_c($_[1],$_[2],'==','==') },
37 1     1   3 '!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') },
38 0     0   0 'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') },
39 1     1   4 'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') },
40 0     0   0 'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') },
41 0     0   0 'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') },
42 2     2   5 'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') },
43 1     1   3 'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') },
44 13     13   118 fallback => 1;
  13         20  
  13         225  
45             }
46             {
47             package Stats::LikeR::Pred;
48 26     26   324 sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' }
49 4     4   44 sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' }
50             use overload
51 2     2   4 '&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) },
52 1     1   2 '|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) },
53 1     1   3 '!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) },
54 13     13   3986 fallback => 1;
  13         14  
  13         98  
55             }
56             sub summary {
57 5     5 1 661492 my ($data, %args);
58 5         36 my $current_sub = (split(/::/,(caller(0))[3]))[-1];
59              
60 5 100 66     30 if (@_ && ref $_[0]) {
61             # Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3)
62 4         5 $data = shift;
63 4         7 %args = @_; # capture any trailing key/value pairs
64             } else {
65             # Handles: summary(@runif) or summary(@runif, nrows => 2)
66             # Extract known trailing named arguments from the flat list
67 1   33     20 while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {
      33        
      33        
68 0         0 my $val = pop @_;
69 0         0 my $key = pop @_;
70 0         0 $args{$key} = $val;
71             }
72             # The remaining items in @_ make up the actual data array
73 1         6 my @list = @_;
74 1         2 $data = \@list;
75             }
76             # Normalize nrow -> nrows, default to 10
77 5   50     24 $args{nrows} //= delete($args{nrow}) // 10;
      66        
78 5         9 my $ref_type = ref $data;
79 5 50 66     17 if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) {
80 0         0 die "$current_sub' data must either be a hash or an array, not \"$ref_type\"";
81             }
82 5         7 my $single_arr = 0;
83 5 100 100     18 if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) {
84 2         3 $single_arr = 1;
85             }
86 5         11 my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');
87 5         6 my @out;
88 5 100       13 if ($single_arr == 1) {
    100          
    50          
89 2         6 push @out, '-' x 75;
90 2         10 my $header = sprintf('%9s ' x scalar @header, @header);
91 2         3 push @out, $header;
92 2         3 push @out, '-' x 75;
93 2         22 my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1;
  198         193  
  2         14  
94 2 50       7 if (scalar @undef > 0) {
95 0         0 say STDERR join (',', @undef);
96 0         0 die "The above indices are not defined in $current_sub";
97             }
98 2         3 my @numeric = grep {looks_like_number($_)} @{ $data };
  198         230  
  2         4  
99 2         48 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
100 2         73 my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
101 2         9 push @out, $vals;
102             } elsif ($ref_type eq 'ARRAY') {
103 1         13 push @out, '-' x 75;
104 1         5 my $header = sprintf('%9s ' x scalar @header, @header);
105 1         3 unshift @header, 'Index';
106 1         2 $header = 'Index ' . $header;
107 1         1 push @out, $header;
108 1         2 push @out, '-' x 75;
109 1         1 my $rows_printed = 0;
110 1         4 foreach my $index (0..$#$data) {
111 2         2 my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1;
  18         34  
  2         5  
112 2 50       5 if (scalar @undef > 0) {
113 0         0 say STDERR join (',', @undef);
114 0         0 die "The above indices are not defined for index $index in $current_sub";
115             }
116 2         3 my @numeric = grep {looks_like_number($_)} @{ $data->[$index] };
  18         23  
  2         3  
117 2         16 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
118 2         27 my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
119 2         4 push @out, $vals;
120 2         3 $rows_printed++;
121 2 50       6 last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe
122             }
123             } elsif ($ref_type eq 'HASH') {
124 2         5 push @out, '-' x 78;
125 2         8 my $header = sprintf('%9s ' x scalar @header, @header);
126 2         3 unshift @header, 'Key';
127 2         5 $header = ' Key ' . $header;
128 2         2 push @out, $header;
129 2         3 push @out, '-' x 78;
130 2         2 my $rows_printed = 0;
131 2         3 foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) {
  4         10  
  2         10  
132 3         4 my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1;
  27         31  
  3         7  
133 3 50       8 if (scalar @undef > 0) {
134 0         0 say STDERR join (',', @undef);
135 0         0 die "The above indices are not defined for key $key in $current_sub";
136             }
137 3         3 my @numeric = grep {looks_like_number($_)} @{ $data->{$key} };
  27         58  
  3         4  
138 3         24 my $q = quantile(\@numeric, probs => [0.25, 0.75]);
139 3         6 my $print_key = substr($key, 0, 9);
140 3 50       8 if ((length $print_key) < 9) { # make sure that short keys line up correctly
141 3         7 $print_key .= ' ' x (9 - length $print_key);
142             }
143 3         40 my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));
144 3         6 push @out, $vals;
145 3         4 $rows_printed++;
146 3 100       11 last if $rows_printed >= $args{nrows};
147             }
148             }
149 5         60 say join ("\n", @out);
150 5         27 return \@out;
151             }
152              
153             sub read_table {
154 523     523 1 736915 my $file = shift;
155 523 50       5180 die "\"$file\" is not a file" unless -f $file;
156 523 50       3195 die "\"$file\" is not readable" unless -r $file;
157              
158 523         663 my %input_args = @_;
159 523 100       769 if (defined $input_args{delim}) {
160 1         3 $input_args{sep} = delete $input_args{delim};
161             }
162              
163 523         534 my $default_sep = ',';
164 523 100       2033 if ($file =~ /\.tsv$/i) {
    50          
165 2         4 $default_sep = "\t";
166             } elsif ($file =~ /\.csv$/i) {
167 521         594 $default_sep = ",";
168             }
169              
170 523         1148 my %args = (
171             sep => $default_sep,
172             comment => '#',
173             %input_args,
174             );
175              
176 523         693 my %allowed_args = map {$_ => 1} (
  3138         3906  
177             'comment', 'output.type', 'filter', 'row.names', 'sep', 'delim'
178             );
179 523         930 my @undef_args = sort grep {!$allowed_args{$_}} keys %args;
  1067         1501  
180 523 50       813 if (scalar @undef_args > 0) {
181 0         0 my $current_sub = (split(/::/,(caller(0))[3]))[-1]; # only needed on the error path
182 0         0 say join (', ', @undef_args);
183 0         0 die "the above args aren't defined for $current_sub";
184             }
185 523   100     1276 $args{'output.type'} = $args{'output.type'} // 'aoh';
186 523 100       1127 if ($args{'output.type'} !~ m/^(?:aoh|hoa|hoh)$/) {
187 1         14 die "\"$args{'output.type'}\" isn't allowed";
188             }
189 522         550 my $filter = $args{filter};
190 522 50 66     1308 if (defined $filter && ref($filter) eq 'CODE') {
    50 66        
191 0         0 $filter = { 0 => $filter };
192             } elsif (defined $filter && ref($filter) ne 'HASH') {
193 0         0 die "'filter' must be a CODE or HASH reference";
194             }
195 522         591 my (@data, %data, @header, %mapped_filters, @sorted_filter_flds, %seen_rownames);
196             _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {
197 6718     6718   7178 my ($line_ref) = @_;
198 6718 100       7792 if (!@header) {
199             # --- HEADER PROCESSING (copy made only here; runs once) ---
200 522         907 my @line = @$line_ref;
201 522 50 33     2691 $line[0] =~ s/^\Q$args{comment}\E// if @line && defined $line[0];
202             # NOTE: trailing-empty stripping removed — the header is now treated like
203             # data rows, so a consistent trailing separator no longer produces a false
204             # "Alignment error". The alignment check still rejects genuinely ragged rows.
205 522         752 @header = @line;
206 522 100 66     1180 if ((scalar @header > 0) && ($header[0] eq '')) {
207 10         15 $header[0] = 'row_name';
208             }
209 522         526 my %seen_h;
210 522         561 my @dup_cols = grep { $seen_h{$_}++ } @header;
  1166         1792  
211 522 100       722 warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n" if @dup_cols;
212 522 100 100     893 if (($args{'output.type'} eq 'hoh') && (not defined $args{'row.names'})) {
213 4         7 $args{'row.names'} = $header[0];
214             }
215 522 50 66     734 if ((defined $args{'row.names'}) && (!grep {$_ eq $args{'row.names'}} @header)) {
  61         80  
216 0         0 die "\"$args{'row.names'}\" isn't in the header of $file";
217             }
218 522 100       669 if ($filter) {
219 4         11 for my $k (keys %$filter) {
220 4 50       17 if ($k =~ /^\d+$/) {
221 0         0 $mapped_filters{$k} = $filter->{$k};
222             } else {
223 4         50 my ($idx) = grep { $header[$_] eq $k } 0..$#header;
  44         55  
224 4 50       13 die "Filter column '$k' not found in header" unless defined $idx;
225 4         18 $mapped_filters{$idx + 1} = $filter->{$k};
226             }
227             }
228 4         13 @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters; # constant per file
  0         0  
229             }
230 522         2093 return;
231             }
232             # --- DATA PROCESSING (operate on $line_ref directly; no per-row array copy) ---
233 6196 100       7248 if (scalar @$line_ref != scalar @header) {
234 1         12 die "Alignment error on $file (" . scalar(@$line_ref) . " fields vs " . scalar(@header) . " headers).";
235             }
236 6195         5423 my %line_hash;
237 6195         7706 for my $i (0 .. $#header) {
238 74732         65383 my $v = $line_ref->[$i];
239 74732 100 66     146376 $line_hash{$header[$i]} = (!defined($v) || $v eq '') ? undef : $v;
240             }
241             # --- APPLY FILTERS ---
242 6195 100       7181 if (@sorted_filter_flds) {
243 1847         10085 local %_ = %line_hash; # row available as %_; set once per row, not per field
244 1847         2557 my $skip = 0;
245 1847         1773 foreach my $fld (@sorted_filter_flds) {
246 1847 50       2438 local $_ = $fld == 0 ? $line_ref : $line_ref->[$fld - 1];
247 1847 100       2469 if (!$mapped_filters{$fld}->($line_ref, \%line_hash)) { $skip = 1; last; }
  1131         2192  
  1131         1103  
248 716 50       1798 if ($fld > 0) { # write back any mutation the callback made to $_
249 716         683 $line_ref->[$fld - 1] = $_;
250 716 100 66     1659 $line_hash{$header[$fld - 1]} = (!defined($_) || $_ eq '') ? undef : $_;
251             }
252             }
253 1847 100       11027 return if $skip;
254             }
255             # Populate requested data structure
256 5064 100       7635 if ($args{'output.type'} eq 'aoh') {
    100          
    50          
257 1863         12097 push @data, \%line_hash;
258             } elsif ($args{'output.type'} eq 'hoa') {
259 1114         1153 push @{ $data{$_} }, $line_hash{$_} for @header;
  15757         30964  
260             } elsif ($args{'output.type'} eq 'hoh') {
261 2087         2131 my $row_name = $line_hash{$args{'row.names'}};
262             warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"
263 2087 100       3267 if $seen_rownames{$row_name}++;
264 2087         1962 foreach my $col (@header) {
265 29171 100       32791 next if $col eq $args{'row.names'};
266 27084         52166 $data{$row_name}{$col} = $line_hash{$col};
267             }
268             }
269 522   50     15799 });
      50        
270 521 100       6397 if ($args{'output.type'} eq 'aoh') {
271 509         2146 return \@data;
272             } else { # hoa or hoh
273 12         7422 return \%data;
274             }
275             }
276              
277             1;
278             =encoding utf8
279              
280             =head1 Synopsis
281              
282             Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc.
283             I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.
284             There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.
285             This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.
286              
287             There B other modules on CPAN that can do B of this, but this works the way that I B it to.
288              
289             =head1 Functions/Subroutines
290              
291             ========================================================================
292              
293             =head2 add_data
294              
295             Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference.
296              
297             =head3 Hash of Hashes (HoH)
298              
299             When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows.
300              
301             $data = { 'Jack Smith' => { age => 30 } };
302            
303             $n = {
304             'Jack Smith' => { # Update existing (Hash)
305             dept => 'Engineering'
306             },
307             'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash)
308             'Invalid' => 'Not a reference' # Edge case safety
309             };
310            
311             add_data($data, $n);
312              
313             B
314              
315             {
316             "Jack Smith": {
317             "age": 30,
318             "dept": "Engineering"
319             },
320             "Jane Doe": {
321             "age": 25,
322             "dept": "Sales"
323             }
324             }
325              
326             =head3 Hash of Arrays (HoA)
327              
328             When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C.
329              
330             $data = { 'Project Alpha' => [ 'task1', 'task2' ] };
331            
332             $n = {
333             'Project Alpha' => [ 'task3' ], # Appends to existing array
334             'Project Beta' => [ 'task1', 'task2' ] # Creates new array row
335             };
336            
337             add_data($data, $n);
338              
339             B
340              
341             {
342             "Project Alpha": [ "task1", "task2", "task3" ],
343             "Project Beta": [ "task1", "task2" ]
344             }
345              
346             =head3 Array of Hashes / Arrays (AoH / AoA)
347              
348             C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices.
349              
350             $data = [
351             { id => 1, name => 'Alice' }
352             ];
353            
354             $n = [
355             { role => 'Admin' }, # Updates index 0
356             { id => 2, name => 'Bob' } # Creates index 1
357             ];
358            
359             add_data($data, $n);
360              
361             B
362              
363             [
364             { "id": 1, "name": "Alice", "role": "Admin" },
365             { "id": 2, "name": "Bob" }
366             ]
367              
368             =head3 Advanced Structural Coercion & Cross-Merging
369              
370             C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target.
371              
372             B<1. Inner Coercion (Mixing Rows):>
373              
374             =over
375              
376             =item * B Source Array rows are read in pairs and converted to key-value pairs.
377              
378             =item * B Source Hash rows are flattened into key-value pairs and pushed onto the array.
379              
380             =back
381              
382             B<2. Root-Level Coercion (Mixing Outer Containers):>
383              
384             =over
385              
386             =item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored.
387              
388             =item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">).
389              
390             =back
391              
392             =head3 Source is a mixed Hash. Keys dictate the target array index!
393              
394             $n = {
395             '0' => { y => 20 }, # Merges into $data->[0]
396             '1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1]
397             'ignored' => { k => 'v' } # Ignored: cannot map to an array index
398             };
399            
400             add_data($data, $n);
401              
402             B
403              
404             [
405             { "x": 10, "y": 20 },
406             { "z": 30 }
407             ]
408              
409             NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data.
410              
411             =head2 aov
412              
413             Warning: assumes normal distribution
414              
415             aov(
416             {
417             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
418             ctrl => [1, 1, 1, 0, 0, 0]
419             },
420             'yield ~ ctrl');
421              
422             which returns
423              
424             {
425             ctrl {
426             Df 1,
427             "F value" 25.6000000000001,
428             "Mean Sq" 1.70666666666667,
429             Pr(>F) 0.00718232855871859,
430             "Sum Sq" 1.70666666666667
431             },
432             Residuals {
433             Df 4,
434             "Mean Sq" 0.0666666666666665,
435             "Sum Sq" 0.266666666666666
436             }
437             }
438              
439             You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:
440              
441             my $res_2way = aov($data_2way, 'len ~ supp * dose');
442              
443             It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.
444              
445             =head3 Input Parameters
446              
447              
448              
449             =begin html
450              
451            
452            
453            
454             Parameter
455             Type
456             Default
457             Description
458             Example
459            
460            
461            
462            
463             data_sv
464             HashRef or ArrayRef
465             (Required)
466             The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's stack() on a named list).
467            
468            
469            
470             formula_sv
471             String
472             undef
473             A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to 'Value ~ Group' and the input data is stacked.
474             'yield ~ N * P'
475            
476            
477            
478              
479             =end html
480              
481              
482              
483             =head3 Output Variables
484              
485             The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs.
486              
487              
488              
489             =begin html
490              
491            
492            
493            
494             Parameter
495             Type
496             Default
497             Description
498             Example
499            
500            
501            
502            
503             (Term Name)
504             HashRef
505             undef
506             A nested hash for each independent term in the formula (e.g., 'Group', 'N:P'), containing its ANOVA table statistics.
507             {'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004}
508            
509            
510             Residuals
511             HashRef
512             undef
513             A nested hash containing the residual (error) statistics for the fitted model.
514             {'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55}
515            
516            
517             group_stats
518             HashRef
519             undef
520             A nested hash containing descriptive statistics (mean and size / count) for every column evaluated in the original unstacked data structure.
521             {'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}}
522            
523            
524            
525              
526             =end html
527              
528              
529              
530             =head3 omitting formula
531              
532             As of version 0.07, in the case of an omitted formula, stacking is done:
533              
534             aov(
535             {
536             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
537             ctrl => [1, 1, 1, 0, 0, 0]
538             },
539             );
540              
541             is the equivalent of:
542              
543             yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2)
544             ctrl <- c(1, 1, 1, 0, 0, 0)
545            
546             # Combine them into a named list (the R equivalent of your hash)
547             my_list <- list(yield = yield, ctrl = ctrl)
548            
549             # Convert the list into a "long" dataframe
550             # This creates two columns: "values" and "ind" (the group name)
551             my_data <- stack(my_list)
552            
553             # Rename columns for clarity (optional but good practice)
554             colnames(my_data) <- c("Value", "Group")
555             anova_model <- aov(Value ~ Group, data = my_data)
556             summary(anova_model)
557              
558             in R
559              
560             =head2 cfilter
561              
562             Select B out of a table and return it in the same shape. A column is
563             the inner (second-level) key of a B or an B,
564             or the outer key of a B:
565              
566             use Stats::LikeR;
567             my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] );
568             cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] }
569             cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] }
570              
571             C takes exactly one of C or C. C returns only the
572             matching columns; C returns everything except them. The result is the
573             same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values
574             copied and the original structure left untouched.
575              
576             =head3 Selecting by name
577              
578             Pass an array ref of column names. Naming a column that is not present in the
579             data is an error (it catches typos), and a row that happens not to contain a
580             kept column simply comes back without it:
581              
582             my @aoh = ( { a => 1, b => 2 }, { a => 3 } );
583             cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ]
584              
585             =head3 Selecting by a predicate
586              
587             Instead of names, C/C accept a B — a CODE ref or a
588             function name — evaluated once per column. It is called as
589              
590             $predicate->($column_values, $column_name)
591              
592             where C<$column_values> is an array ref of the column's B cells (undef
593             and missing cells are dropped, so functions like C get clean input).
594             With C, columns for which the predicate is true are kept; with C,
595             those columns are dropped.
596              
597             # Keep only the constant columns (standard deviation zero):
598             my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] }
599             # Drop the constant columns instead:
600             my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... }
601             # A bare function name resolves in Stats::LikeR:: (use a package for your own):
602             cfilter(\%hoa, keep => 'some_predicate');
603              
604             A bare string is always treated as a B, not a single column
605             name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>.
606              
607             =head3 Errors
608              
609             C dies (via C) when:
610              
611             =over
612              
613             =item * neither C nor C is given, or both are,
614              
615             =item * a named column is not present in the data,
616              
617             =item * the selector is neither an array ref nor a code ref / function name, or the
618             function name cannot be resolved,
619              
620             =item * an unknown option is given, or the options are not C<< name =E value >> pairs,
621              
622             =item * the data is not a hash/array reference of the expected shape (a hash of hash
623             refs or array refs, or an array of hash refs).
624              
625             =back
626              
627             =head2 chisq_test
628              
629             The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results.
630              
631             For 2x2 matrices, Yates' Continuity Correction is applied automatically.
632              
633             =head3 Accepted Inputs
634              
635              
636              
637             =begin html
638              
639            
640            
641            
642             Input Type
643             Data Structure
644             Applied Test
645            
646            
647            
648            
649             1D Array
650             [ $v1, $v2, ... ]
651             Chi-squared test for given probabilities
652            
653            
654             2D Array
655             [ [ $v1, $v2 ], [ $v3, $v4 ] ]
656             Pearson's Chi-squared test (Yates' correction if 2x2)
657            
658            
659             1D Hash
660             { key1 => $v1, key2 => $v2 }
661             Chi-squared test for given probabilities
662            
663            
664             2D Hash
665             { row1 => { c1 => $v1, c2 => $v2 } }
666             Pearson's Chi-squared test (Yates' correction if 2x2)
667            
668            
669            
670              
671             =end html
672              
673              
674              
675             =head3 Output Object Structure
676              
677             The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input.
678              
679              
680              
681             =begin html
682              
683            
684            
685            
686             Key
687             Data Type
688             Description
689            
690            
691            
692            
693             data.name
694             String
695             Identifies the input type (e.g., "Perl ArrayRef" or "Perl HashRef").
696            
697            
698             expected
699             Array/Hash Ref
700             The expected frequencies, matching the geometry of the input.
701            
702            
703             method
704             String
705             The specific statistical test applied.
706            
707            
708             observed
709             Array/Hash Ref
710             The original data passed to the function.
711            
712            
713             p.value
714             Float
715             The calculated p-value of the test.
716            
717            
718             parameter
719             Hash Ref
720             Contains the degrees of freedom (df).
721            
722            
723             statistic
724             Hash Ref
725             Contains the test statistic (X-squared).
726            
727            
728            
729              
730             =end html
731              
732              
733              
734             =head3 Two-Dimensional Array
735              
736             Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically.
737              
738             my $test_data = [
739             [762, 327, 468],
740             [484, 239, 477]
741             ];
742             my $res = chisq_test($test_data);
743              
744             B
745              
746             {
747             'data.name' => 'Perl ArrayRef',
748             'expected' => [
749             [ 703.671381936888, 319.645266594124, 533.683351468988 ],
750             [ 542.328618063112, 246.354733405876, 411.316648531012 ]
751             ],
752             'method' => "Pearson's Chi-squared test",
753             'observed' => [
754             [ 762, 327, 468 ],
755             [ 484, 239, 477 ]
756             ],
757             'p.value' => 2.95358918321176e-07,
758             'parameter' => { 'df' => 2 },
759             'statistic' => { 'X-squared' => 30.0701490957547 }
760             }
761              
762             =head3 1-Dimensional Array (Goodness of Fit)
763              
764             Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items.
765              
766             my $data = [10, 20, 30];
767             my $res = chisq_test($data);
768              
769             B
770              
771             {
772             'data.name' => 'Perl ArrayRef',
773             'expected' => [ 20, 20, 20 ],
774             'method' => 'Chi-squared test for given probabilities',
775             'observed' => [ 10, 20, 30 ],
776             'p.value' => 0.00673794699908547,
777             'parameter' => { 'df' => 2 },
778             'statistic' => { 'X-squared' => 10 }
779             }
780              
781             =head3 2-Dimensional Hash (Pearson's Chi-squared)
782              
783             Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings.
784              
785             my $data = {
786             GroupA => { Success => 10, Failure => 15 },
787             GroupB => { Success => 20, Failure => 5 }
788             };
789            
790             my $res = chisq_test($data);
791              
792             B
793              
794             {
795             'data.name' => 'Perl HashRef',
796             'expected' => {
797             'GroupA' => { 'Failure' => 10, 'Success' => 15 },
798             'GroupB' => { 'Failure' => 10, 'Success' => 15 }
799             },
800             'method' => "Pearson's Chi-squared test with Yates' continuity correction",
801             'observed' => {
802             'GroupA' => { 'Failure' => 15, 'Success' => 10 },
803             'GroupB' => { 'Failure' => 5, 'Success' => 20 }
804             },
805             'p.value' => 0.00937475878430379,
806             'parameter' => { 'df' => 1 },
807             'statistic' => { 'X-squared' => 6.75 }
808             }
809              
810             =head3 One-Dimensional Hash (Goodness of Fit)
811              
812             Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks.
813              
814             my $data = {
815             Apples => 10,
816             Oranges => 20,
817             Bananas => 30
818             };
819            
820             my $res = chisq_test($data);
821              
822             =head1 C
823              
824             Apply a B to every pair of columns in a table and collect
825             the answers in a hash of hashes.
826              
827             It's the workhorse behind things like correlation matrices: give it your data and
828             the name of a function that takes two columns (C, C, …) and you get
829             back every column compared against every other column.
830              
831             use Stats::LikeR;
832            
833             my %data = (
834             height => [ 170, 165, 180, 175 ],
835             weight => [ 70, 60, 85, 77 ],
836             age => [ 30, 41, 25, 38 ],
837             );
838            
839             my $result = col2col(\%data, 'cor');
840            
841             # $result->{height}{weight} == correlation of height vs weight
842             # $result->{height}{age} == correlation of height vs age
843             # ...and so on for every pair
844              
845             ========================================================================
846              
847             =head2 Arguments
848              
849             col2col( $data, $command, $cols, %options )
850             col2col( $data, $command, \%options ) # options in place of $cols
851              
852              
853              
854             =begin html
855              
856            
857            
858            
859             Position
860             Argument
861             What it is
862            
863            
864            
865            
866             1
867             $data
868             Your table, as a reference (see Data shapes below).
869            
870            
871             2
872             $command
873             A code block or the name of a two-column function.
874            
875            
876             3
877             $cols
878             (optional) Which columns to use as the "from" side. Omit for all.
879            
880            
881             4+
882             %options
883             (optional) na, skip.errors, … (see Options).
884            
885            
886            
887              
888             =end html
889              
890              
891              
892             ========================================================================
893              
894             =head2 Data shapes
895              
896             C understands three layouts. In every case a B is the thing that
897             gets compared, and the result is keyed by column name.
898              
899             B — keys are column names:
900              
901             my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] );
902              
903             B — First keys are row names, second keys are columns:
904              
905             my %hoh = (
906             row1 => { a => 1, b => 4 },
907             row2 => { a => 2, b => 5 },
908             );
909              
910             B — each element is a row, inner keys are columns:
911              
912             my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } );
913              
914             All three produce the same result for the same underlying numbers. Missing or
915             C cells are handled by the C option (below).
916              
917             ========================================================================
918              
919             =head2 The command
920              
921             The second argument is the function applied to each pair of columns. It is called
922             as:
923              
924             $command->( $column_a, $column_b ) # two ARRAY refs
925              
926             so inside a block the two columns arrive in C<@_>:
927              
928             my $result = col2col(\%data, sub {
929             my ($x, $y) = @_; # $x and $y are array refs
930             cor($x, $y);
931             });
932              
933             You can also pass a B. A bare name is looked up in
934             C, so these two are equivalent:
935              
936             col2col(\%data, 'cor');
937             col2col(\%data, sub { cor($_[0], $_[1]) });
938              
939             ========================================================================
940              
941             =head2 The result
942              
943             Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>.
944              
945             for my $from (sort keys %$result) {
946             for my $to (sort keys %{ $result->{$from} }) {
947             printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to};
948             }
949             }
950              
951             A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist.
952              
953             ========================================================================
954              
955             =head2 Restricting columns (C<$cols>)
956              
957             By default every column is used as the "from" side. The third argument narrows
958             that down — handy when you only care about one variable.
959              
960             # all columns vs all columns
961             my $all = col2col(\%data, 'cor');
962             # just ONE column vs every other column
963             my $one = col2col(\%data, 'cor', 'height');
964             my $cors = $one->{height}; # { weight => ..., age => ... }
965             # a FEW specific columns vs every other column
966             my $few = col2col(\%data, 'cor', ['height', 'weight']);
967              
968             The "to" side is always every other column; C<$cols> only limits the outer keys.
969              
970             ========================================================================
971              
972             =head2 Options
973              
974             Options can be given two ways:
975              
976             col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols
977             col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed
978              
979             The hash-ref form is convenient when you have B column restriction — it saves
980             you from passing a placeholder. (A hash ref I C<$cols>, so you can't use
981             it to restrict columns at the same time; use the trailing form for that.)
982              
983             =head3 C — how undefined values are handled
984              
985             Real data has gaps. C decides what the function sees.
986              
987              
988              
989             =begin html
990              
991            
992            
993            
994             Value
995             Behaviour
996             Use for
997            
998            
999            
1000            
1001             'pairwise' (default)
1002             A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length.
1003             Paired stats like cor.
1004            
1005            
1006             'omit'
1007             Each column drops its own undefined values independently. The two columns may end up different lengths.
1008             Unpaired tests like t_test, kruskal_test, where a gap in one sample shouldn't discard a value in the other.
1009            
1010            
1011             'keep'
1012             Every row is passed through, undef and all.
1013             When your function does its own missing-data handling.
1014            
1015            
1016            
1017              
1018             =end html
1019              
1020              
1021              
1022             # correlation: keep only complete pairs (the default)
1023             col2col(\%data, 'cor');
1024             # two-sample test: each column keeps its own values
1025             col2col(\%data, 't_test', undef, na => 'omit');
1026             col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder
1027              
1028             C / C remain as boolean aliases for backward compatibility:
1029             C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C.
1030              
1031             =head3 C — keep going when a pair fails I<(default: true)>
1032              
1033             Some functions croak on degenerate input — for example C dies if a column has
1034             zero variance. By default C B that croak per pair: instead of
1035             aborting the whole run, it stores the B of the error message in that
1036             cell, so the result tells you I pair failed and I. Every other cell is
1037             computed normally.
1038              
1039             my $r = col2col(\%data, 'cor');
1040             # a good pair: $r->{a}{b} == 0.83
1041             # a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0'
1042              
1043             To restore the old "die on the first error" behaviour, turn it off:
1044              
1045             col2col(\%data, 'cor', undef, 'skip.errors' => 0);
1046             col2col(\%data, 'cor', { 'skip.errors' => 0 });
1047              
1048             Only errors from B are trapped. Mistakes in the call itself
1049             (unknown column, bad data, unknown function name, unknown option) always die.
1050              
1051             ========================================================================
1052              
1053             =head2 Worked examples
1054              
1055             B
1056              
1057             my $m = col2col(\%data, 'cor');
1058              
1059             B
1060              
1061             my $col = 'Testosterone, total (nmol/L)';
1062             my $cors = col2col($hoa, 'cor', $col)->{$col};
1063             for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) {
1064             next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message
1065             printf "%-30s % .3f\n", $other, $cors->{$other};
1066             }
1067              
1068             B
1069              
1070             my $t = col2col($hoa, 't_test', undef, na => 'omit');
1071              
1072             B
1073              
1074             my $m = col2col($hoa, 'cor');
1075             for my $from (sort keys %$m) {
1076             for my $to (sort keys %{ $m->{$from} }) {
1077             my $v = $m->{$from}{$to};
1078             warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error
1079             }
1080             }
1081              
1082             ========================================================================
1083              
1084             =head2 Gotchas
1085              
1086             =over
1087              
1088             =item * B, C<($col_a, $col_b)> — not a column and
1089             a name. Unpack with C.
1090              
1091             =item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall
1092             variance can be flat on just the rows it shares with one partner, so C may
1093             still croak for that pair. With the default C, that shows up as a
1094             message in the single offending cell rather than killing the run.
1095              
1096             =item * B<< C does not modify your data. >> It reads the table and returns a new
1097             hash of hashes.
1098              
1099             =item * B — i.e.
1100             C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0>
1101             means column C is the degenerate one for that pair.
1102              
1103             =back
1104              
1105             =head2 cor
1106              
1107             cor($array1, $array2, $method = 'pearson'),
1108              
1109             that is, C is the default and will be used if C<$method> is not specified.
1110              
1111             Just like R, C, C, and C are available
1112              
1113             If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically.
1114              
1115             =head2 cor_test
1116              
1117             my $result = cor_test(
1118             'x' => $x,
1119             'y' => $y,
1120             alternative => 'two.sided',
1121             method => 'pearson',
1122             continuity => 1
1123             );
1124              
1125             C safely handles C (or C) values seamlessly by computing over pairwise complete observations.
1126              
1127             =head2 cov
1128              
1129             cov($array1, $array2, 'pearson')
1130              
1131             or
1132              
1133             cov($array1, $array2, 'spearman')
1134              
1135             or
1136              
1137             cov($array1, $array2, 'kendall')
1138              
1139             =head2 dnorm
1140              
1141             gives the density of the normal distribution, with the specified mean and standard deviation.
1142              
1143             In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value.
1144              
1145             returns a single scalar/number if a single value is given, otherwise returns an array reference.
1146              
1147             Usage:
1148              
1149             dnorm(4) # assumes a mean of 0 and standard deviation of 1
1150              
1151             but default mean, standard deviation, and log can be passed as parameters:
1152              
1153             $x = dnorm(0, mean => 0, sd => 2, 'log' => 0);
1154              
1155             =head2 filter
1156              
1157             Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified.
1158              
1159             my $df2 = filter($df, col('column.name') > 4);
1160              
1161             C accepts a predicate in one of two forms:
1162              
1163             =over
1164              
1165             =item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and
1166              
1167             =item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>.
1168              
1169             =back
1170              
1171             Both C and C
1172              
1173             =head3 Arguments
1174              
1175              
1176              
1177             =begin html
1178              
1179            
1180            
1181            
1182             Position
1183             Name
1184             Description
1185            
1186            
1187            
1188            
1189             1
1190             $df
1191             The data frame to filter. Either an array of hashes (AoH — e.g. the default output of read_table) or a hash of arrays (HoA).
1192            
1193            
1194             2
1195             predicate
1196             Either a col() comparison object or a CODE reference.
1197            
1198            
1199            
1200              
1201             =end html
1202              
1203              
1204              
1205             The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned.
1206              
1207             =head3 The C form
1208              
1209             C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row.
1210              
1211             filter($df, col('age') >= 18); # keep rows where age >= 18
1212             filter($df, col('sex') eq 'f'); # keep rows where sex is 'f'
1213             filter($df, 18 <= col('age')); # operands may be in either order
1214              
1215             =head3 Comparison operators
1216              
1217              
1218              
1219             =begin html
1220              
1221            
1222            
1223            
1224             Kind
1225             Operators
1226             Comparison
1227            
1228            
1229            
1230            
1231             Numeric
1232             > < >= <= == !=
1233             numeric (the cell and the value are compared as numbers)
1234            
1235            
1236             String
1237             gt lt ge le eq ne
1238             string (the cell and the value are compared as strings)
1239            
1240            
1241            
1242              
1243             =end html
1244              
1245              
1246              
1247             C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>.
1248              
1249             =head3 Combining predicates: C<&>, C<|>, C
1250              
1251             Predicates compose with bitwise C<&> (and), C<|> (or), and C (not):
1252              
1253             filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and
1254             filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or
1255             filter($df, !(col('x') > 100)); # not
1256              
1257             Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability.
1258              
1259             =head3 The code-reference form
1260              
1261             For logic the operators can't express, pass a C. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row.
1262              
1263             filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' });
1264             filter($df, sub { $_->{name} =~ /^A/ });
1265             filter($df, sub { $_[0]{score} > $_[0]{threshold} });
1266              
1267             For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape.
1268              
1269             =head3 Examples
1270              
1271             use Stats::LikeR;
1272             my $df = read_table('patients.csv'); # array of hashes
1273             # numeric threshold
1274             my $adults = filter($df, col('Age') >= 18);
1275             # combine conditions
1276             my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f'));
1277             # arbitrary logic with a coderef
1278             my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 });
1279             # hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel
1280             my $hoa = read_table('patients.csv', 'output.type' => 'hoa');
1281             my $sub = filter($hoa, col('Age') > 32);
1282             # $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned
1283              
1284             =head3 Behavior and notes
1285              
1286             =over
1287              
1288             =item * B C builds and returns a new frame; C<$df> is left untouched.
1289              
1290             =item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C.
1291              
1292             =item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays.
1293              
1294             =item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA).
1295              
1296             =item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error.
1297              
1298             =item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS.
1299              
1300             =back
1301              
1302             =head3 See also
1303              
1304             C (whose C option applies the same coderef convention while reading a file), C.
1305              
1306             =head2 fisher_test
1307              
1308             =head3 array reference entry
1309              
1310             my $array_data = [
1311             [10, 2],
1312             [3, 15]
1313             ];
1314             my $res1 = fisher_test($array_data);
1315              
1316             which returns a hash reference:
1317              
1318             {
1319             alternative "two.sided",
1320             conf_int [
1321             [0] 2.75343836564204,
1322             [1] 300.682787419401
1323             ],
1324             conf_level 0.95,
1325             estimate {
1326             "odds ratio" 21.3053312750168
1327             },
1328             method "Fisher's Exact Test for Count Data",
1329             p_value 0.000536724119143435
1330             }
1331              
1332             =head3 hash reference entry
1333              
1334             $ft = fisher_test( {
1335             Guess => {
1336             Milk => 3, Tea => 1
1337             },
1338             Truth => {
1339             Milk => 1, Tea => 3
1340             }
1341             });
1342              
1343             =head2 glm
1344              
1345             takes a hash of an array as input
1346              
1347             my %tooth_growth = (
1348             dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
1349             1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
1350             0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
1351             2.0 2.0 2.0)],
1352             len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5
1353             17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5
1354             15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0
1355             25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],
1356             supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
1357             VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
1358             OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]
1359             );
1360            
1361             my $glm_teeth = glm(
1362             data => \%tooth_growth,
1363             formula => 'len ~ dose + supp',
1364             family => 'gaussian'
1365             );
1366              
1367             In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS):
1368              
1369             my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');
1370              
1371             =head3 Input Parameters
1372              
1373              
1374              
1375             =begin html
1376              
1377            
1378            
1379            
1380             Parameter
1381             Type
1382             Default
1383             Description
1384             Example
1385            
1386            
1387            
1388            
1389             formula
1390             String
1391             None (Required)
1392             A symbolic description of the model to be fitted. Supports operators like +, :, *, ^, and -1 (to remove the intercept).
1393             'am ~ wt + hp', 'y ~ x - 1'
1394            
1395            
1396             data
1397             HashRef or ArrayRef
1398             None (Required)
1399             The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH).
1400             \%mtcars, [{x => 1, y => 2}, ...]
1401            
1402            
1403             family
1404             String
1405             'gaussian'
1406             A description of the error distribution and link function to be used in the model. Currently supports 'gaussian' (identity link) and 'binomial' (logit link).
1407             'binomial'
1408            
1409            
1410            
1411              
1412             =end html
1413              
1414              
1415              
1416             =head3 Output variables
1417              
1418              
1419              
1420             =begin html
1421              
1422            
1423            
1424            
1425             Variable
1426             Type
1427             Description
1428             Example
1429            
1430            
1431            
1432            
1433             aic
1434             Double
1435             Akaike's Information Criterion for the fitted model.
1436             123.45
1437            
1438            
1439             boundary
1440             Integer (Boolean)
1441             1 if the fitted values computationally reached the 0 or 1 boundary (specific to the binomial family), 0 otherwise.
1442             0
1443            
1444            
1445             coefficients
1446             HashRef
1447             A hash mapping the expanded model term names to their estimated coefficient values.
1448             {'Intercept' => 1.5, 'wt' => -0.5}
1449            
1450            
1451             converged
1452             Integer (Boolean)
1453             1 if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, 0 otherwise.
1454             1
1455            
1456            
1457             deviance
1458             Double
1459             The residual deviance of the fitted model.
1460             15.2
1461            
1462            
1463             deviance.resid
1464             HashRef
1465             A hash mapping data row names to their computed deviance residuals.
1466             {'Mazda RX4' => 0.12}
1467            
1468            
1469             df.null
1470             Integer
1471             The residual degrees of freedom for the null model.
1472             31
1473            
1474            
1475             df.residual
1476             Integer
1477             The residual degrees of freedom for the fitted model.
1478             30
1479            
1480            
1481             family
1482             String
1483             The statistical family used to fit the model.
1484             "gaussian"
1485            
1486            
1487             fitted.values
1488             HashRef
1489             A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response).
1490             {'Mazda RX4' => 0.85}
1491            
1492            
1493             iter
1494             Integer
1495             The number of IRLS iterations performed before convergence or hitting the iteration limit.
1496             4
1497            
1498            
1499             null.deviance
1500             Double
1501             The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed).
1502             43.5
1503            
1504            
1505             rank
1506             Integer
1507             The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters).
1508             2
1509            
1510            
1511             summary
1512             HashRef
1513             A nested hash mapping each term to its detailed summary statistics, including Estimate, Std. Error, t value / z value, and Pr(> t ) / Pr(> z ). Aliased parameters return "NaN".
1514             {'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}}
1515            
1516            
1517             terms
1518             ArrayRef
1519             An ordered list of the expanded term names included in the model matrix.
1520             ['Intercept', 'wt', 'hp']
1521            
1522            
1523            
1524              
1525             =end html
1526              
1527              
1528              
1529             =head2 group_by
1530              
1531             Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column.
1532              
1533             my $aoh_data = [
1534             { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },
1535             { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },
1536             { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },
1537             { 'Gender' => 'Female' } # Intentional missing target value
1538             ];
1539              
1540             as well as
1541              
1542             $hoh_data = {
1543             'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },
1544             'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },
1545             'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },
1546             'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value
1547             'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef
1548             };
1549              
1550             and
1551              
1552             my $hoa_data = {
1553             'Gender' => ['Male', 'Female', 'Male', 'Female'],
1554             'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ]
1555             };
1556              
1557             then run the function thus:
1558              
1559             group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender');
1560              
1561             The output can be thought of like a hash, with the first string broken down by the second.
1562              
1563             all become hash of arrays:
1564              
1565             {
1566             Female [
1567             [0] 1.8
1568             ],
1569             Male [
1570             [0] 18.2,
1571             [1] 20.5
1572             ]
1573             }
1574              
1575             returns an empty array of hashes if neither target nor group keys are found.
1576              
1577             =head3 Filtering
1578              
1579             Data can be further broken down with filter/subs like in C:
1580              
1581             my $testosterone = group_by($d, # group testosterone by "Gender"
1582             'Testosterone, total (nmol/L)',
1583             'Gender',
1584             { 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter
1585             { 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter
1586             );
1587              
1588             where each filter filters on the columns, e.g. second hash keys.
1589              
1590             =head2 hoh2hoa
1591              
1592             Convert a B (row-major: outer key = row, inner key = column)
1593             into a B (column-major: key = column, value = that column's
1594             cells down the rows).
1595              
1596             use Stats::LikeR;
1597            
1598             my %hoh = (
1599             'r1' => { 'a' => 1, 'b' => 2 },
1600             'r2' => { 'a' => 3, 'b' => 4 },
1601             );
1602            
1603             my $hoa = hoh2hoa(\%hoh);
1604              
1605             which returns
1606             {
1607             a => [1, 3],
1608             b => [2, 4],
1609             }
1610              
1611             =head3 Behavior
1612              
1613             =over
1614              
1615             =item * B are the union of every inner key, so a key that appears in only
1616             some rows still becomes a column.
1617              
1618             =item * B are emitted in sorted outer-key (row-name) order, and that one order
1619             is used for every column, so the arrays stay aligned and the result is
1620             reproducible regardless of hash ordering.
1621              
1622             =item * B — a missing inner key, or a cell whose value is C — are filled
1623             with the fill value (see C below). Every column therefore has
1624             exactly one entry per row.
1625              
1626             =item * Values are B into the result; the original structure is left
1627             untouched.
1628              
1629             =item * An B hash of hashes returns an empty hash of arrays (it is not an
1630             error).
1631              
1632             =back
1633              
1634             =head3 Options
1635              
1636             Options are passed as trailing C<< name =E value >> pairs.
1637              
1638              
1639              
1640             =begin html
1641              
1642            
1643            
1644            
1645             Option
1646             Default
1647             Meaning
1648            
1649            
1650            
1651            
1652             undef.val
1653             undef
1654             Value used to fill a missing key or an undef cell. Any defined scalar works, including 0 and ''. Passing undef keeps the default.
1655            
1656            
1657             row.names
1658             (none)
1659             If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column.
1660            
1661            
1662            
1663              
1664             =end html
1665              
1666              
1667              
1668             # Ragged input with an explicit fill string:
1669             my %ragged = (
1670             'r1' => { 'a' => 1, 'b' => 2 },
1671             'r2' => { 'a' => 3, 'c' => 9 },
1672             );
1673             my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA');
1674             # {
1675             # a => [1, 3 ],
1676             # b => [2, 'NA'],
1677             # c => ['NA', 9 ],
1678             # }
1679            
1680             # Keep the row labels as a column:
1681             my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id');
1682             # {
1683             # id => ['r1', 'r2'],
1684             # a => [1, 3 ],
1685             # b => [2, undef],
1686             # c => [undef, 9 ],
1687             # }
1688              
1689             =head3 Errors
1690              
1691             C dies (via C) when:
1692              
1693             =over
1694              
1695             =item * the argument is not a hash reference,
1696              
1697             =item * any value in the hash is not itself a hash reference,
1698              
1699             =item * an unknown option is given, or the options are not C<< name =E value >> pairs,
1700              
1701             =item * C is not a plain string, or it names an already-present column.
1702              
1703             =back
1704              
1705             =head2 hist
1706              
1707             Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.
1708              
1709             my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);
1710              
1711             If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.
1712              
1713             =head2 kruskal_test
1714              
1715             Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)
1716              
1717             Performs a Kruskal-Wallis rank sum test, see
1718             https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test
1719              
1720             =head3 hash of array entry
1721              
1722             I feel that this is better, and more easily read, than what you get in R:
1723              
1724             my %x = (
1725             'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],
1726             'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],
1727             'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]
1728             );
1729             $kt = kruskal_test(\%x);
1730              
1731             =head3 R-like array entry
1732              
1733             my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects
1734             my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease
1735             my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis
1736             my @x = (@xk, @yk, @zk);
1737             my @g = (
1738             (map {'Normal subjects'} 0..4),
1739             (map {'Subjects with obstructive airway disease'} 0..3),
1740             map {'Subjects with asbestosis'} 0..4
1741             );
1742             my $kt = kruskal_test(\@x, \@g);
1743              
1744             =head2 ks_test
1745              
1746             The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are part of the same distribution is implemented simply:
1747              
1748             $ks = ks_test(\@x, \@y, alternative => 'greater');
1749              
1750             returning a hash reference.
1751              
1752             Also, a single array can be tested against a normal distribution:
1753              
1754             $ks = ks_test($ksx, 'pnorm');
1755              
1756             The p-value precision is about 1e-8, which I want to improve, but am not sure how.
1757              
1758             =head2 ljoin
1759              
1760             Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>.
1761             C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>
1762              
1763             For example,
1764              
1765             {
1766             "Jack Smith" {
1767             age 30
1768             }
1769             }
1770              
1771             and a second hash,
1772             {
1773             "Jack Smith" {
1774             dept "Engineering"
1775             },
1776             "Jane Doe" {
1777             age 25
1778             }
1779             }
1780              
1781             in this case, running C will modify \%h to result:
1782              
1783             {
1784             "Jack Smith" {
1785             age 30,
1786             dept "Engineering"
1787             }
1788             }
1789              
1790             =head2 lm
1791              
1792             This is the linear models function.
1793              
1794             $lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);
1795              
1796             where C<$mtcars> is a hash of hashes
1797              
1798             C also supports generating interaction terms directly within the formula using the C<*> operator:
1799              
1800             my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);
1801              
1802             If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting.
1803              
1804             the dot operator also works:
1805              
1806             $lm = lm(formula => 'y ~ .', data => $dot_data);
1807              
1808             =head2 matrix
1809              
1810             my $mat1 = matrix(
1811             data => [1..6],
1812             nrow => 2
1813             );
1814              
1815             You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise.
1816              
1817             As of version 0.10, parameters do not need to be named, so that C works more like R:
1818              
1819             my $d = matrix(rnorm(32000), 1000, 32);
1820              
1821             works as C, C, and C
1822              
1823             =head2 max
1824              
1825             max(1,2,3);
1826              
1827             or
1828              
1829             my @arr = 1..8;
1830             max(@arr, 4, 5)
1831              
1832             as of version 0.02, max will die if any undefined values are provided
1833              
1834             =head2 mean
1835              
1836             mean(1,2,3);
1837              
1838             or
1839              
1840             my @arr = 1..8;
1841             mean(@arr, 4, 5)
1842              
1843             or
1844              
1845             mean([1,1], [2,2]) # 1.5
1846              
1847             as of version 0.02, mean will die if any undefined values are provided
1848              
1849             =head2 median
1850              
1851             works like mean, taking array references and arrays:
1852              
1853             median( $test_data[$i][0] )
1854              
1855             as of version 0.02, median will die if any undefined values are provided
1856              
1857             =head2 min
1858              
1859             min(1,2,3);
1860              
1861             or
1862              
1863             my @arr = 1..8;
1864             min(@arr, 4, 5)
1865              
1866             as of version 0.02, min will die if any undefined values are provided
1867              
1868             =head2 mode
1869              
1870             Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings)
1871              
1872             @arr = mode([1,3,3,3]); # returns (3)
1873            
1874             @arr = mode('a','a','c','c','z'); # returns ('a', 'c')
1875              
1876             =head2 oneway_test
1877              
1878             Like ANOVA/aov but does not assume normality
1879              
1880             =head3 hash of array input
1881              
1882             $test_data = oneway_test({
1883             yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
1884             ctrl => [1, 1, 1, 0, 0, 0]
1885             });
1886              
1887             which will output a hash reference:
1888              
1889             {
1890             Group {
1891             Df 1,
1892             "F value" 177.504798464491,
1893             "Mean Sq" 61.6533333333333,
1894             Pr(>F) 1.31343255160843e-07,
1895             "Sum Sq" 61.6533333333333
1896             },
1897             group_stats {
1898             mean {
1899             ctrl 0.5,
1900             yield 5.03333333333333
1901             },
1902             size {
1903             ctrl 6,
1904             yield 6
1905             }
1906             },
1907             Residuals {
1908             Df 9.81767348326473,
1909             "Mean Sq" 0.353783749200256,
1910             "Sum Sq" 3.47333333333333
1911             }
1912              
1913             }
1914              
1915             =head3 array of array input
1916              
1917             oneway_test([
1918             [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],
1919             [1, 1, 1, 0, 0, 0]
1920             ]);
1921              
1922             which will output a nearly identical hash reference as for hash of arrays:
1923              
1924             {
1925             Group {
1926             Df 1,
1927             "F value" 177.504798464491,
1928             "Mean Sq" 61.6533333333333,
1929             Pr(>F) 1.31343255160843e-07,
1930             "Sum Sq" 61.6533333333333
1931             },
1932             group_stats {
1933             mean {
1934             "Index 0" 5.03333333333333,
1935             "Index 1" 0.5
1936             },
1937             size {
1938             "Index 0" 6,
1939             "Index 1" 6
1940             }
1941             },
1942             Residuals {
1943             Df 9.81767348326473,
1944             "Mean Sq" 0.353783749200256,
1945             "Sum Sq" 3.47333333333333
1946             }
1947             }
1948              
1949             =head2 p_adjust
1950              
1951             Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"
1952              
1953             my @q = p_adjust(\@pvalues, $method);
1954              
1955             =head2 power_t_test
1956              
1957             $test_data = power_t_test(
1958             n => 30, delta => 0.5,
1959             sd => 1.0, sig_level => 0.05
1960             );
1961              
1962             It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution.
1963              
1964              
1965              
1966             =begin html
1967              
1968            
1969            
1970            
1971             Parameter
1972             Type
1973             Default
1974             Description
1975            
1976            
1977            
1978            
1979             n
1980             Float
1981             undef
1982             Number of observations (per group for two-sample, pairs for paired).
1983            
1984            
1985             delta
1986             Float
1987             undef
1988             True difference in means.
1989            
1990            
1991             sd
1992             Float
1993             1.0
1994             Standard deviation.
1995            
1996            
1997             sig_level
1998             Float
1999             0.05
2000             Significance level (Type I error probability). Also accepts sig.level.
2001            
2002            
2003             power
2004             Float
2005             undef
2006             Power of test (1 minus Type II error probability).
2007            
2008            
2009             type
2010             String
2011             "two.sample"
2012             Type of t-test: "two.sample", "one.sample", or "paired".
2013            
2014            
2015             alternative
2016             String
2017             "two.sided"
2018             One- or two-sided test: "two.sided", "one.sided", "greater", or "less".
2019            
2020            
2021             strict
2022             Boolean
2023             0 (False)
2024             Use strict interpretation of two-sided power calculations.
2025            
2026            
2027             tol
2028             Float
2029             ~1.22e-4
2030             Numerical tolerance used for the internal root-finding algorithm.
2031            
2032            
2033            
2034              
2035             =end html
2036              
2037              
2038              
2039             =head2 prcomp
2040              
2041             Principal Component Analysis
2042              
2043             =head3 Options
2044              
2045              
2046              
2047             =begin html
2048              
2049            
2050            
2051            
2052             Option
2053             Type
2054             Default
2055             Description
2056            
2057            
2058            
2059            
2060             center
2061             Boolean
2062             1 (True)
2063             If true, the variables are shifted to be zero-centered before the analysis takes place.
2064            
2065            
2066             scale
2067             Boolean
2068             0 (False)
2069             If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will croak to prevent division by zero.
2070            
2071            
2072             retx
2073             Boolean
2074             1 (True)
2075             If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key x.
2076            
2077            
2078             tol
2079             Number
2080             undef
2081             A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to tol times the standard deviation of the first component.
2082            
2083            
2084             rank
2085             Integer
2086             undef
2087             Optionally specify a strict limit on the number of principal components to return. The function will return min(rank, rows, columns) components.
2088            
2089            
2090            
2091              
2092             =end html
2093              
2094              
2095              
2096             =head3 Results
2097              
2098             =head4 Returned Data Structure
2099              
2100             The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis:
2101              
2102              
2103              
2104             =begin html
2105              
2106            
2107            
2108            
2109             Key
2110             Type
2111             Description
2112            
2113            
2114            
2115            
2116             sdev
2117             ArrayRef[Number]
2118             The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix.
2119            
2120            
2121             rotation
2122             ArrayRef[ArrayRef]
2123             A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components.
2124            
2125            
2126             x
2127             ArrayRef[ArrayRef]
2128             A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the retx option is true.
2129            
2130            
2131             center
2132             ArrayRef[Number] or 0
2133             The centering values used (typically the column means). Returns false (0) if centering was disabled.
2134            
2135            
2136             scale
2137             ArrayRef[Number] or 0
2138             The scaling values used (typically the column standard deviations). Returns false (0) if scaling was disabled.
2139            
2140            
2141             varnames
2142             ArrayRef[String]
2143             The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH).
2144            
2145            
2146            
2147              
2148             =end html
2149              
2150              
2151              
2152             =head3 Using array of arrays
2153              
2154             my $aoa = [
2155             [2, 4],
2156             [4, 2],
2157             [6, 6]
2158             ];
2159            
2160             my $pca = prcomp($aoa);
2161              
2162             which returns
2163              
2164             {
2165             center [
2166             [0] 4,
2167             [1] 4
2168             ],
2169             rotation [
2170             [0] [
2171             [0] 0.707106781186547,
2172             [1] 0.707106781186548
2173             ],
2174             [1] [
2175             [0] 0.707106781186548,
2176             [1] -0.707106781186547
2177             ]
2178             ],
2179             scale 0,
2180             sdev [
2181             [0] 2.44948974278318,
2182             [1] 1.4142135623731
2183             ],
2184             x [
2185             [0] [
2186             [0] -1.41421356237309,
2187             [1] -1.4142135623731
2188             ],
2189             [1] [
2190             [0] -1.4142135623731,
2191             [1] 1.41421356237309
2192             ],
2193             [2] [
2194             [0] 2.82842712474619,
2195             [1] 2.22044604925031e-16
2196             ]
2197             ]
2198             }
2199              
2200             =head3 Hash of Arrays
2201              
2202             my $hoa = { B => [4, 2, 6], A => [2, 4, 6] };
2203             my $pca = prcomp($hoa);
2204              
2205             =head2 quantile
2206              
2207             Calculates sample quantiles using R's continuous Type 7 interpolation.
2208              
2209             my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);
2210              
2211             If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).
2212              
2213             =head2 rbinom
2214              
2215             Create a binomial distribution of numbers
2216              
2217             my $binom = rbinom( n => $n, prob => 0.5, size => 9);
2218              
2219             It hooks directly into Perl's internal PRNG system, respecting C seeds.
2220              
2221             =head2 read_table
2222              
2223             I've tried to make this as simple as possible, trying to follow from R:
2224              
2225             my $test_data = read_table('t/HepatitisCdata.csv');
2226              
2227             =head3 options
2228              
2229              
2230              
2231             =begin html
2232              
2233            
2234            
2235            
2236             Option
2237             Description
2238             Example
2239            
2240            
2241            
2242            
2243             comment
2244             Comment character, by default #
2245             comment => % or whatever; does not apply with header
2246            
2247            
2248             output.type
2249             data type for output: array of hash, hash of array, or hash of hash
2250             'output.type' => 'aoh'
2251            
2252            
2253             filter
2254             Only take in rows with a certain filter
2255             filter => { Sex => sub {$_ eq 'f'} }
2256            
2257            
2258             row.names
2259             include row names in retrieved data; off by default
2260            
2261            
2262            
2263             sep
2264             field separator character; synonym with delim
2265             sep => "\t"
2266            
2267            
2268             delim
2269             field separator character; synonym with sep
2270             delim => "\t"
2271            
2272            
2273            
2274              
2275             =end html
2276              
2277              
2278              
2279             output types can be AOH (aoa), HOA (hoa), HOH (hoh)
2280              
2281             read_table($filename, 'output.type' => 'aoh');
2282            
2283             read_table($filename, 'output.type' => 'hoa');
2284              
2285             and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:
2286              
2287             $test_data = read_table(
2288             't/HepatitisCdata.csv',
2289             filter => {
2290             Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column
2291             },
2292             'output.type' => 'aoh'
2293             );
2294              
2295             the default delimiter is C<,>
2296             Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority.
2297              
2298             =head2 rnorm
2299              
2300             Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C.
2301              
2302             my ($rmean, $sd, $n) = (10, 2, 9999);
2303             my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);
2304              
2305             =head2 runif
2306              
2307             Make an approximately uniform distribution into an array
2308              
2309             =head3 named arguments
2310              
2311             my $unif = runif( n => $n, min => 0, max => 1);
2312              
2313             where C is the number of items, the values are between C and C
2314              
2315             =head3 positional args
2316              
2317             this is to match R's behavior:
2318              
2319             runif( 9 )
2320              
2321             will make 9 numbers in [0,1]
2322              
2323             runif(9, 0, 99)
2324              
2325             will match C, C, and C respectively
2326              
2327             =head2 sample
2328              
2329             take a sample of hash or array slices.
2330              
2331             my $h = sample(\%h, 4); # take 4 hash keys and their values into $h
2332              
2333             or, alternatively, with arrays:
2334              
2335             my $arr = sample(\@arr, 3); # take 3 indices of an array
2336              
2337             =head2 scale
2338              
2339             my @scaled_results = scale(1..5);
2340              
2341             You can also pass an options hash to disable centering or scaling:
2342              
2343             my @scaled_results = scale(1..5, { center => false, scale => 1 });
2344              
2345             It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently:
2346              
2347             my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);
2348              
2349             =head2 sd
2350              
2351             my $stdev = sd(2,4,4,4,5,5,7,9);
2352              
2353             Correct answer is 2.1380899352994
2354              
2355             C can accept both array references as well as arrays:
2356              
2357             my $stdev = sd([2,4,4,4,5,5,7,9]);
2358              
2359             As of version 0.02, sd will croak/die if any undefined values are provided.
2360              
2361             =head2 seq
2362              
2363             Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference.
2364              
2365             =head3 Standard integer sequence
2366              
2367             say 'seq(1, 5):';
2368             my @seq = seq(1, 5);
2369             say join(', ', @seq), "\n";
2370            
2371             say 'seq(1, 2, 0.25):';
2372             @seq = seq(1, 2, 0.25);
2373              
2374             =head3 Fractional steps
2375              
2376             say 'seq(1, 2, 0.25):';
2377             @seq = seq(1, 2, 0.25);
2378             say join(", ", @seq), "\n";
2379             for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop
2380             is_approx(pop @seq, $idx, "seq item $idx with fractional step");
2381             }
2382              
2383             =head3 Negative steps
2384              
2385             say 'seq(10, 5, -1):';
2386             @seq = seq(10, 5, -1);
2387             say join(", ", @seq), "\n";
2388             for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop
2389             is_approx(pop @seq, $idx, "seq item $idx with negative step");
2390             }
2391              
2392             =head2 shapiro_test
2393              
2394             tests to see if an array reference is normally distributed, returns a p-value and a statistic
2395              
2396             my $shapiro = shapiro_test(
2397             [1..5]
2398             );
2399              
2400             and returns the hash reference:
2401              
2402             {
2403             p.value 0.589650577093106,
2404             p_value 0.589650577093106,
2405             statistic 0.960870680168535,
2406             W 0.960870680168535
2407             }
2408              
2409             =head2 sum
2410              
2411             returns sum, but using both arrays and array references.
2412              
2413             my $test_data = [1..8];
2414             sum($test_data)
2415              
2416             which I prefer, compared to List::Util's required casting into an array:
2417              
2418             sum(@{ $test_data });
2419              
2420             which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B
2421              
2422             as of version 0.02, C will cause the script to die if any undefined values are provided
2423              
2424             =head2 summary
2425              
2426             Analogous to R's C, but does not deal with outputs from other functions.
2427             C only describes data as it is entered.
2428             An option C or its synonym C specifies the maximum number of rows that will print.
2429              
2430             =head3 array of array input
2431              
2432             my @arr;
2433             foreach my $i (0..18) {
2434             push @arr, runif(22);
2435             }
2436              
2437             and then C, or C
2438              
2439             ---------------------------------------------------------------------------
2440             Index # values Min. 1st Qu. Median Mean 3rd Qu. Max.
2441             ---------------------------------------------------------------------------
2442             0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633
2443             1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371
2444             2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878
2445             3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808
2446             4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043
2447             5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556
2448              
2449             =head3 hash of array input
2450              
2451             $test_data = summary(
2452             {
2453             A => runif(9),
2454             B => runif(9)
2455             },
2456             );
2457              
2458             =head2 t_test
2459              
2460             There are 1-sample and 2-sample t-tests, from one or two arrays:
2461              
2462             my $t_test = t_test( $array1, mu => 0.2334 );
2463              
2464             or 2-sample:
2465              
2466             $t_test = t_test(
2467             $array1, $array2,
2468             paired => 1
2469             );
2470              
2471             returns a hash reference, which looks like:
2472              
2473             conf_int => [
2474             -0.06672889, 0.25672889
2475             ],
2476             df => 5,
2477             estimate => 0.095,
2478             p_value => 0.19143688433660,
2479             statistic => 1.50996688705414
2480              
2481             the two groups compared can be specified, though not necessarily, as C and C, just like in R:
2482              
2483             $t_test = t_test(
2484             'x' => $array1, 'y' => $array2,
2485             paired => 1
2486             );
2487              
2488             =head3 Parameters
2489              
2490              
2491              
2492             =begin html
2493              
2494            
2495            
2496            
2497             Parameter
2498             Type
2499             Default
2500             Description
2501            
2502            
2503            
2504            
2505             x
2506             Array Reference
2507             Required
2508             The first vector of data. Must contain at least 2 elements.
2509            
2510            
2511             y
2512             Array Reference
2513             undef
2514             The second vector of data. Required for two-sample or paired tests.
2515            
2516            
2517             mu
2518             Float
2519             0.0
2520             The true value of the mean (or difference in means) for the null hypothesis.
2521            
2522            
2523             paired
2524             Boolean
2525             FALSE
2526             If true, performs a paired t-test. x and y must be the same length.
2527            
2528            
2529             var_equal
2530             Boolean
2531             FALSE
2532             If true, assumes equal variances (standard two-sample). If false, performs Welch's t-test with unequal variances.
2533            
2534            
2535             conf_level
2536             Float
2537             0.95
2538             Confidence level for the returned confidence interval. Must be between 0 and 1.
2539            
2540            
2541             alternative
2542             String
2543             "two.sided"
2544             Direction of the alternative hypothesis: "two.sided", "less", or "greater".
2545            
2546            
2547            
2548              
2549             =end html
2550              
2551              
2552              
2553             =head3 Return Hash
2554              
2555              
2556              
2557             =begin html
2558              
2559            
2560            
2561            
2562             Key
2563             Description
2564            
2565            
2566            
2567            
2568             statistic
2569             The computed t-statistic.
2570            
2571            
2572             df
2573             Degrees of freedom for the test.
2574            
2575            
2576             p_value
2577             The calculated p-value based on the test directionality.
2578            
2579            
2580             conf_int
2581             An Array Reference containing two elements: [lower_bound, upper_bound].
2582            
2583            
2584             estimate
2585             The estimated mean of x (one-sample) OR the mean of the differences (paired).
2586            
2587            
2588             estimate_x
2589             The estimated mean of the x vector (only returned in two-sample tests).
2590            
2591            
2592             estimate_y
2593             The estimated mean of the y vector (only returned in two-sample tests).
2594            
2595            
2596            
2597              
2598             =end html
2599              
2600              
2601              
2602             =head2 transpose
2603              
2604             Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes.
2605             Returns a new reference of the same type; the input is never modified.
2606              
2607             =head3 Array of array input
2608              
2609             Takes a reference to an array of array references and returns a new AoA where C.
2610              
2611             my $matrix = [[1, 2, 3], [4, 5, 6]];
2612             my $t = transpose($matrix);
2613             # [[1, 4],
2614             # [2, 5],
2615             # [3, 6]]
2616              
2617             All rows must be the same length; a ragged input is a fatal error.
2618             C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>.
2619              
2620             Dies if:
2621             - any inner element is not an array reference
2622             - rows differ in length (ragged array)
2623              
2624             =head3 Hash of hash input
2625              
2626             Takes a reference to a hash of hash references and returns a new HoH where C.
2627              
2628             my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } };
2629             my $t = transpose($table);
2630             # { score => { alice => 97, bob => 84 },
2631             # grade => { alice => 'A', bob => 'B' } }
2632              
2633             Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed.
2634              
2635             my $sparse = {
2636             a => { x => 1, y => 2 },
2637             b => { x => 3, z => 4 } };
2638            
2639             my $t = transpose($sparse);
2640             # { x => { a => 1, b => 3 },
2641             # y => { a => 2 },
2642             # z => { b => 4 } }
2643              
2644             An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>.
2645              
2646             Dies if any inner element is not a hash reference
2647              
2648             =head2 value_counts
2649              
2650             Count the values in a given data set, return a hash reference showing how many times each particular value is present.
2651              
2652             =head3 Scalar
2653              
2654             $hash = value_counts('c');
2655              
2656             returns C<< { c =E 1 } >>
2657              
2658             =head3 Array reference
2659              
2660             value_counts(['a','b','b']);
2661              
2662             returns C<< { a =E 1, b =E 2} >>
2663              
2664             =head3 Array
2665              
2666             my $value_counts = value_counts('a','b','b');
2667              
2668             like an array reference above, returns C<< { a =E 1, b =E 2} >>
2669              
2670             =head3 Hash
2671              
2672             my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } );
2673              
2674             returns C<< { a =E 2, b =E 1} >>
2675              
2676             =head3 Hash of array
2677              
2678             my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']});
2679              
2680             without a key (like above), the occurences of C, C, and C are counted.
2681              
2682             With a key, like C for above, only values within that hash key are counted:
2683              
2684             my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a');
2685              
2686             =head3 Hash of hash (table)
2687              
2688             $hash = value_counts( {
2689             A => {
2690             a => 'x',
2691             b => 'z'
2692             },
2693             B => {
2694             a => 'x'
2695             },
2696             C => {
2697             a => 'y'
2698             }
2699             }, 'a');
2700              
2701             the column, or second hash key, that you wish to count, is specified at the command line
2702              
2703             =head2 var
2704              
2705             as simple as possible:
2706              
2707             var(2, 4, 5, 8, 9)
2708              
2709             as of version 0.02, C will die if any undefined values are provided
2710              
2711             like C, C, etc., C can accept array references, to make code simpler:
2712              
2713             my $ref = \@arr;
2714             var($ref) = var(@arr)
2715              
2716             =head2 var_test
2717              
2718             As described by R: Performs an F test to compare the variances of two samples from normal populations
2719              
2720             use Stats::LikeR;
2721            
2722             my @x = (2.9, 3.0, 2.5, 2.6, 3.2);
2723             my @y = (3.8, 2.7, 4.0, 2.4);
2724            
2725             my $vt = var_test(\@x, \@y);
2726              
2727             also, conf_level can be set:
2728              
2729             $vt = var_test(\@x, \@y, conf_level => 0.99);
2730              
2731             as well as a ratio (from R: the hypothesized ratio of the population variances of C and C:
2732              
2733             $test_data = var_test(\@xk, \@yk, ratio => 2);
2734              
2735             =head2 wilcox_test
2736              
2737             $test_data = wilcox_test(
2738             [1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],
2739             [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]
2740             );
2741              
2742             It fully supports paired tests (C<< paired =E 1 >>) and can calculate exact p-values (the default for C<< N E 50 >> without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.
2743              
2744             =head2 write_table
2745              
2746             mimics R's C, with data as first argument to subroutine, and output file as second
2747              
2748             write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1);
2749              
2750             You can also precisely filter and reorder which columns are written by passing an array reference to C:
2751              
2752             write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);
2753              
2754             undefined variables are printed as C by default, but can be set as you wish using C
2755              
2756             write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan')
2757              
2758             as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set.
2759              
2760             Args can also be accepted:
2761              
2762             write_table( 'data' => \%flat, 'file' => $f );
2763              
2764             =head1 changes
2765              
2766             =head2 0.14
2767              
2768             C function added for rows
2769              
2770             C reads undefined values to C instead of C, which makes calculations easier
2771              
2772             C writes undef by default as an empty string C<''>
2773              
2774             C transforms a hash of hashes into an hash of arrays
2775              
2776             C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b
2777              
2778             Numerous switches from C to C for local precision, like above
2779              
2780             numerous changes to C for ease of use and working with datasets with numerous undefined values
2781              
2782             dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea
2783              
2784             C now should be complete, errors with confidence intervals fixed
2785              
2786             =head2 0.13
2787              
2788             C: speed improvements; commented headers are now allowed
2789              
2790             C: fix for
2791              
2792             Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182.
2793             main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203
2794             Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183.
2795             main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203
2796              
2797             C gives better warnings for incorrect types of data given
2798              
2799             Numerous changes to dist.ini to improve CPAN testing, especially for Win32
2800              
2801             =head2 0.12
2802              
2803             C can also take hash of arrays, and various mixes of data types
2804              
2805             C: Addition of C keywords in many places; should improve CPU performance
2806              
2807             Better POD formatting, correction of output hash for README's C
2808              
2809             C can now accept hash of hashes as input
2810              
2811             new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns
2812              
2813             removed unused function from C helpers
2814              
2815             C: addition of restrict keywords in preinit, should improve CPU performance
2816              
2817             MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing
2818              
2819             using C for tests of C, which may or may not work with CPAN testers (experimental)
2820              
2821             Added function name to warnings, so I actually know which function is producing the error
2822              
2823             C can also take C and C as args, in addition to positions
2824              
2825             fixed C as it could hang if given empty C or C
2826              
2827             Added C<__EXTENSIONS__> to source XS file for better CPAN testing
2828              
2829             =head2 0.11
2830              
2831             better POD formatting for tables
2832              
2833             addition of MANIFEST.skip to get better testing results on CPAN
2834              
2835             C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t
2836              
2837             C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes
2838              
2839             Better documentation for t-test
2840              
2841             =head2 0.10
2842              
2843             changes to compilation for CPAN, trying to get this work on Windows
2844              
2845             Addition of C and C
2846              
2847             C will work without key names, just like in R. Testing for C has improved.
2848              
2849             =head2 0.09
2850              
2851             context changes in XS C, C, and C to get better CPAN testing results
2852              
2853             C keywords added to C to increase speed
2854              
2855             =head2 0.08
2856              
2857             Speed improvement in C of hashes.
2858              
2859             Addition of C, C, C, C, and C functions
2860              
2861             Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call.
2862              
2863             Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better.
2864              
2865             C now returns hash-of-hash in {row}{column}
2866              
2867             =head2 0.07
2868              
2869             Addition of C function.
2870              
2871             Formulas can now be omitted from C, resulting in a stacked calculation as R would think.
2872              
2873             Addition of C for multi-group comparisons that does not assume normality like C does.
2874              
2875             C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name.
2876              
2877             =head2 0.06
2878              
2879             Changed compiler options so that Solaris will work
2880              
2881             signed integers changed to unsigned in C
2882              
2883             Added restrict keywords to C, and made C to C
2884              
2885             =head2 0.05
2886              
2887             Leak testing for C
2888              
2889             removal of Data::Printer dependency for easier CPAN testing
2890              
2891             switched several C variable to C so that clang doesn't complain
2892              
2893             added restrict keyword for C
2894              
2895             =head2 0.04
2896              
2897             addition of C function
2898              
2899             GNU source, to maximize compatibility and ease installation
2900              
2901             removal of JSON dependency to ease installation
2902              
2903             =head2 0.03
2904              
2905             Compatibility back to Perl 5.10
2906              
2907             =head2 0.02
2908              
2909             back-compatible to Perl 5.10, instead of original 5.40, ensuring more people can use it
2910              
2911             added var_test
2912              
2913             mean, min, sum, median, var, and max die with undefined values, and print the offending indices
2914              
2915             "group_stats" added to aov, for TukeyHSD in the future
2916              
2917             "cor" dies when given data with standard deviation of 0
2918              
2919             C now has C option, which shows how undefined values are printed to tables, which is C by default.