File Coverage

blib/lib/Stats/LikeR.pm

Criterion	Covered	Total	%
statement	387	395	97.9
branch	212	236	89.8
condition	60	81	74.0
subroutine	35	35	100.0
pod	4	4	100.0
total	698	751	92.9

are exported by default.

line stmt bran cond sub pod time code

1 #!/usr/bin/env perl

2 # ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance

3 require 5.010;

4 23 23 1201268 use strict;

23 37

23 810

5 23 23 151 use feature 'say';

23 37

23 6359

6 package Stats::LikeR;

7 our $VERSION = 0.16;

8 require XSLoader;

9 23 23 14427 use Devel::Confess 'color';

23 224576

23 87

10 23 23 2358 use warnings FATAL => 'all';

23 60

23 1067

11 23 23 9284 use autodie ':default';

23 341909

23 80

12 23 23 110512 use Exporter 'import';

23 38

23 891

13 23 23 106 use Scalar::Util 'looks_like_number';

23 32

23 14366

14 XSLoader::load('Stats::LikeR', $VERSION);

15 our @EXPORT_OK = qw(add_data aoh2hoa aov cfilter chisq_test col col2col cor cor_test cov csort dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test view wilcox_test write_table);

16 our @EXPORT = @EXPORT_OK;

18 require XSLoader;

19 # ---- filter DSL: col() builds a predicate via overloading (pure Perl) -------

20 # Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm;

21 # they need no XS. filter() is the XSUB.

22 50 50 1 274588 sub col { Stats::LikeR::Col->new($_[0]) }

23 {

24 package Stats::LikeR::Col;

25 50 33 50 351 sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] }

26 # build a comparison leaf; if operands were swapped (4 > col('x')), flip the op

27 sub _c {

28 50 50 98 my ($self, $val, $swapped, $op, $flip) = @_;

29 50 100 127 Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val);

30 }

31 use overload

32 19 19 42 '>' => sub { $_[0]->_c($_[1],$_[2],'>','<') },

33 7 7 23 '<' => sub { $_[0]->_c($_[1],$_[2],'<','>') },

34 2 2 7 '>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') },

35 3 3 9 '<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') },

36 4 4 11 '==' => sub { $_[0]->_c($_[1],$_[2],'==','==') },

37 2 2 6 '!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') },

38 2 2 6 'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') },

39 2 2 6 'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') },

40 2 2 6 'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') },

41 2 2 5 'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') },

42 3 3 10 'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') },

43 2 2 7 'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') },

44 23 23 206 fallback => 1;

23 43

23 489

45 }

46 {

47 package Stats::LikeR::Pred;

48 50 50 504 sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' }

49 8 8 68 sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' }

50 use overload

51 4 4 11 '&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) },

52 2 2 6 '|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) },

53 2 2 6 '!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) },

54 23 23 7861 fallback => 1;

23 31

23 157

55 }

56 sub summary {

57 15 15 1 799852 my ($data, %args);

58 15 128 my $current_sub = (split(/::/,(caller(0))[3]))[-1];

60 15 100 66 99 if (@_ && ref $_[0]) {

61 # Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3)

62 10 16 $data = shift;

63 10 22 %args = @_; # capture any trailing key/value pairs

64 } else {

65 # Handles: summary(@runif) or summary(@runif, nrows => 2)

66 # Extract known trailing named arguments from the flat list

67 5 33 51 while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {

68 2 3 my $val = pop @_;

69 2 3 my $key = pop @_;

70 2 11 $args{$key} = $val;

71 }

72 # The remaining items in @_ make up the actual data array

73 5 17 my @list = @_;

74 5 14 $data = \@list;

75 }

76 # Normalize nrow -> nrows, default to 10

77 15 100 105 $args{nrows} //= delete($args{nrow}) // 10;

78 15 28 my $ref_type = ref $data;

79 15 100 100 72 if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) {

80 1 14 die "$current_sub' data must either be a hash or an array, not \"$ref_type\"";

81 }

82 14 21 my $single_arr = 0;

83 14 100 100 53 if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) {

84 8 15 $single_arr = 1;

85 }

86 14 37 my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');

87 14 19 my @out;

88 14 100 40 if ($single_arr == 1) {

100

89 8 14 push @out, '-' x 75;

90 8 42 my $header = sprintf('%9s ' x scalar @header, @header);

91 8 14 push @out, $header;

92 8 12 push @out, '-' x 75;

93 8 12 my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1;

219 402

8 30

94 8 100 45 if (scalar @undef > 0) {

95 1 27 say STDERR join (',', @undef);

96 1 13 die "The above indices are not defined in $current_sub";

97 }

98 7 10 my @numeric = grep {looks_like_number($_)} @{ $data };

216 334

7 28

99 7 158 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

100 7 202 my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

101 7 30 push @out, $vals;

102 } elsif ($ref_type eq 'ARRAY') {

103 3 7 push @out, '-' x 75;

104 3 19 my $header = sprintf('%9s ' x scalar @header, @header);

105 3 7 unshift @header, 'Index';

106 3 7 $header = 'Index ' . $header;

107 3 5 push @out, $header;

108 3 5 push @out, '-' x 75;

109 3 5 my $rows_printed = 0;

110 3 10 foreach my $index (0..$#$data) {

111 6 11 my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1;

36 90

6 15

112 6 50 18 if (scalar @undef > 0) {

113 0 0 say STDERR join (',', @undef);

114 0 0 die "The above indices are not defined for index $index in $current_sub";

115 }

116 6 13 my @numeric = grep {looks_like_number($_)} @{ $data->[$index] };

36 86

6 13

117 6 60 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

118 6 118 my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

119 6 14 push @out, $vals;

120 6 9 $rows_printed++;

121 6 100 28 last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe

122 }

123 } elsif ($ref_type eq 'HASH') {

124 3 8 push @out, '-' x 78;

125 3 18 my $header = sprintf('%9s ' x scalar @header, @header);

126 3 10 unshift @header, 'Key';

127 3 9 $header = ' Key ' . $header;

128 3 6 push @out, $header;

129 3 6 push @out, '-' x 78;

130 3 6 my $rows_printed = 0;

131 3 7 foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) {

4 17

3 19

132 5 11 my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1;

33 64

5 17

133 5 50 16 if (scalar @undef > 0) {

134 0 0 say STDERR join (',', @undef);

135 0 0 die "The above indices are not defined for key $key in $current_sub";

136 }

137 5 9 my @numeric = grep {looks_like_number($_)} @{ $data->{$key} };

33 73

5 11

138 5 60 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

139 5 15 my $print_key = substr($key, 0, 9);

140 5 50 15 if ((length $print_key) < 9) { # make sure that short keys line up correctly

141 5 12 $print_key .= ' ' x (9 - length $print_key);

142 }

143 5 116 my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

144 5 15 push @out, $vals;

145 5 7 $rows_printed++;

146 5 100 45 last if $rows_printed >= $args{nrows};

147 }

148 }

149 13 110 say join ("\n", @out);

150 13 77 return \@out;

151 }

152

153 sub read_table {

154 552 552 1 975312 my $file = shift;

155 552 100 5058 die "read_table: \"$file\" is not a file\n" unless -f $file;

156 551 50 3342 die "read_table: \"$file\" is not readable\n" unless -r $file;

157

158 551 727 my %input_args = @_;

159 551 100 1009 if (exists $input_args{delim}) {

160 # FIX: sep + delim together used to silently prefer delim

161 die "read_table: pass either 'sep' or 'delim', not both\n"

162 2 100 13 if exists $input_args{sep};

163 1 3 $input_args{sep} = delete $input_args{delim};

164 }

165

166 550 100 1475 my $default_sep = $file =~ /\.tsv$/i ? "\t" : ',';

167 550 1196 my %args = (

168 sep => $default_sep,

169 comment => '#',

170 %input_args,

171 );

172

173 550 724 my %allowed_args = map { $_ => 1 } (

2750 3378

174 'comment', 'output.type', 'filter', 'row.names', 'sep',

175 );

176 550 962 my @undef_args = sort grep { !$allowed_args{$_} } keys %args;

1139 1649

177 550 100 851 if (@undef_args) {

178 1 3 my $current_sub = ( split /::/, (caller(0))[3] )[-1];

179 # FIX: no more printing to STDOUT from a library ('say'); the die

180 # message carries the offending argument names itself

181 1 39 die "the args \"@undef_args\" aren't defined for $current_sub\n";

182 }

183 549 100 1219 my $otype = $args{'output.type'} // 'aoh';

184 549 100 1236 die "read_table: output.type \"$otype\" isn't allowed (aoh, hoa, hoh)\n"

185 unless $otype =~ m/^(?:aoh|hoa|hoh)$/;

186

187 547 590 my $filter = $args{filter};

188 547 100 100 1350 if (defined $filter && ref($filter) eq 'CODE') {

100

189 1 5 $filter = { 0 => $filter };

190 } elsif (defined $filter && ref($filter) ne 'HASH') {

191 1 6 die "'filter' must be a CODE or HASH reference\n";

192 }

193

194 546 560 my (@data, %data, @header, @uniq_header,

195 %mapped_filters, @sorted_filter_flds, %seen_rownames);

196 546 508 my $data_row = 0;

197 _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {

198 6780 6780 7810 my ($line_ref) = @_;

199 6780 100 8376 if (!@header) {

200 # --- HEADER PROCESSING (copy made only here; runs once) ---

201 546 918 my @line = @$line_ref;

202 $line[0] =~ s/^\Q$args{comment}\E//

203 546 50 33 3461 if @line && defined $line[0] && length( $args{comment} // '' );

204 546 763 @header = @line;

205 546 100 66 1164 if (@header && $header[0] eq '') {

206 11 28 $header[0] = 'row_name';

207 }

208 546 501 my %seen_h;

209 # FIX: hoa output with duplicate column names used to push the

210 # same cell once PER OCCURRENCE, silently corrupting the column

211 # lengths. Iterate unique names (order preserved) instead.

212 546 576 @uniq_header = grep { !$seen_h{$_}++ } @header;

1223 2272

213 546 574 my @dup_cols = grep { $seen_h{$_} > 1 } @uniq_header;

1221 1462

214 546 100 1147 warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n"

215 if @dup_cols;

216 546 100 100 811 if ($otype eq 'hoh' && !defined $args{'row.names'}) {

217 6 12 $args{'row.names'} = $header[0];

218 }

219 546 100 100 814 if (defined $args{'row.names'}

220 71 112 && !grep { $_ eq $args{'row.names'} } @header) {

221 1 7 die "\"$args{'row.names'}\" isn't in the header of $file\n";

222 }

223 545 100 687 if ($filter) {

224 11 28 for my $k (keys %$filter) {

225 12 100 40 if ($k =~ /^\d+$/) {

226 # FIX: a numeric key past the last column used to be

227 # accepted and then silently extended every row via

228 # the $_ write-back

229 3 100 32 die "read_table: numeric filter key $k exceeds the "

230 . scalar(@header) . " columns of $file\n"

231 if $k > @header;

232 2 4 $mapped_filters{$k} = $filter->{$k};

233 } else {

234 9 20 my ($idx) = grep { $header[$_] eq $k } 0 .. $#header;

54 67

235 9 100 22 die "Filter column '$k' not found in header\n"

236 unless defined $idx;

237 8 26 $mapped_filters{ $idx + 1 } = $filter->{$k};

238 }

239 }

240 9 22 @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters;

1 5

241 }

242 543 2052 return;

243 }

244 # --- DATA PROCESSING (operate on $line_ref directly; no row copy) ---

245 6234 5488 $data_row++;

246 6234 100 7575 if (@$line_ref != @header) {

247 # FIX: alignment errors now say WHICH row is ragged

248 2 18 die sprintf "Alignment error on %s data row %d (%d fields vs %d headers).\n",

249 $file, $data_row, scalar @$line_ref, scalar @header;

250 }

251 6232 5715 my %line_hash;

252 6232 8025 for my $i (0 .. $#header) {

253 74819 72881 my $v = $line_ref->[$i];

254 74819 100 66 158118 $line_hash{ $header[$i] } = ( !defined($v) || $v eq '' ) ? undef : $v;

255 }

256 # --- APPLY FILTERS ---

257 6232 100 7821 if (@sorted_filter_flds) {

258 # FIX: 'local %_ = %line_hash' copied the whole row for every

259 # filtered row AND went stale after a $_ write-back. Aliasing the

260 # glob makes %_ THE row hash: zero copy, mutations always visible.

261 1862 1903 local *_ = \%line_hash;

262 1862 1720 my $skip = 0;

263 1862 1899 foreach my $fld (@sorted_filter_flds) {

264 1865 100 2706 local $_ = $fld == 0 ? $line_ref : $line_hash{ $header[ $fld - 1 ] };

265 1865 100 2533 if ( !$mapped_filters{$fld}->( $line_ref, \%line_hash ) ) {

266 1137 2247 $skip = 1;

267 1137 1111 last;

268 }

269 728 100 1820 if ( $fld > 0 ) { # write back any mutation made to $_

270 727 709 $line_ref->[ $fld - 1 ] = $_;

271 727 100 100 1635 $line_hash{ $header[ $fld - 1 ] }

272 = ( !defined($_) || $_ eq '' ) ? undef : $_;

273 }

274 }

275 1862 100 8849 return if $skip;

276 }

277 # Populate requested data structure

278 5095 100 7354 if ($otype eq 'aoh') {

100

279 1886 14820 push @data, \%line_hash;

280 } elsif ($otype eq 'hoa') {

281 1118 1172 push @{ $data{$_} }, $line_hash{$_} for @uniq_header;

15767 29527

282 } elsif ($otype eq 'hoh') {

283 2091 2318 my $row_name = $line_hash{ $args{'row.names'} };

284 # FIX: an undef/empty row-name cell used to become the '' hash

285 # key with "uninitialized" warnings; now it dies with the row

286 die sprintf "read_table: undefined row name (column '%s') in %s data row %d\n",

287 2091 100 2439 $args{'row.names'}, $file, $data_row

288 unless defined $row_name;

289 warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"

290 2090 100 3509 if $seen_rownames{$row_name}++;

291 2090 2036 foreach my $col (@uniq_header) {

292 29180 100 33709 next if $col eq $args{'row.names'};

293 27090 54994 $data{$row_name}{$col} = $line_hash{$col};

294 }

295 }

296 546 50 15670 });

297 540 100 6861 if ($otype eq 'aoh') {

298 524 2113 return \@data;

299 } else { # hoa or hoh

300 16 7933 return \%data;

301 }

302 }

303 # ---------------------------------------------------------------------------

304 # view(): an R-style `head` for the structures read_table() returns.

305 #

306 # Handles all three output.type values:

307 # aoh -> ARRAY of HASH refs

308 # hoa -> HASH of ARRAY refs

309 # hoh -> HASH of HASH refs

310 #

311 # Pure Perl; the only dependency is Scalar::Util (core). There is nothing for

312 # XS to speed up here: view only touches the first n rows, and the total-row

313 # count is O(1) for every structure (scalar @$aoh, array length, scalar keys).

314 # Keeping it pure Perl also keeps install portable (no compiler needed).

315 #

316 # Usage:

317 # view($data); # first 6 rows, like head()

318 # view($data, n => 20); # first 20 rows

319 # view($data, cols => [...]); # pin explicit column order

320 # view($data, na => '.', max_width => 30, gap => 1, ellipsis => '~');

321 # view($data, 'row.names' => 'id'); # use column 'id' as the row label

322 # view($data, to => \*STDERR); # print elsewhere

323 # my $s = view($data, return_only => 1); # capture string, suppress print

324 #

325 # Column order: read_table stores rows as hashes, so the original CSV column

326 # order is gone. view sorts columns by name for a stable layout and treats a

327 # column literally named 'row_name' (read_table's label for a leading blank

328 # header) as the row label. Pass cols => [...] to override the order.

329 # ---------------------------------------------------------------------------

330

331 sub view {

332 41 41 1 164210 my $data = shift;

333 41 112 my %args = @_;

334

335 # --- reject unknown arguments (mirrors read_table/write_table) ---

336 41 74 my %allowed = map { $_ => 1 } qw(

492 651

337 n rows na max_width ellipsis gap cols columns

338 to return_only row.names row_names

339 );

340 41 98 my @bad = sort grep { !$allowed{$_} } keys %args;

60 114

341 41 100 90 die "view: unknown argument(s): @bad\n" if @bad;

342

343 # --- n / rows (synonyms); reject conflicting or non-integer values ---

344 die "view: pass either 'n' or 'rows', not both\n"

345 40 100 100 112 if exists $args{n} && exists $args{rows};

346 my $n = exists $args{rows} ? $args{rows}

347 : exists $args{n} ? $args{n}

348 39 100 89 : 6;

100

349 39 100 66 285 die "view: 'n'/'rows' must be a non-negative integer\n"

350 unless defined $n && $n =~ /^\d+$/;

351

352 37 100 53 my $na = exists $args{na} ? $args{na} : 'NA';

353 37 100 62 my $maxw = exists $args{max_width} ? $args{max_width} : 80;

354 37 100 48 my $ell = exists $args{ellipsis} ? $args{ellipsis} : '...';

355 37 50 45 my $gap = exists $args{gap} ? (' ' x $args{gap}) : ' ';

356 37 100 80 my $ucols = $args{cols} || $args{columns};

357 37 42 my $fh = $args{to};

358 37 41 my $quiet = $args{return_only};

359

360 # 'row.names' takes precedence over the row_names alias (both accepted)

361 my $label_col = exists $args{'row.names'} ? $args{'row.names'}

362 : exists $args{row_names} ? $args{row_names}

363 37 100 67 : undef;

100

364

365 37 58 my $rt = ref $data;

366 37 100 100 85 die "view: expected an ARRAY (AoH) or HASH (HoA/HoH) reference, got "

100

367 . ($rt || 'a non-reference') . "\n"

368 unless $rt eq 'ARRAY' or $rt eq 'HASH';

369

370 35 48 my ($kind, @cols, @labels, @raw, $total, $lab_header);

371 35 100 52 if ($rt eq 'ARRAY') { # ---- AoH ----

372 22 23 $kind = 'AoH';

373 22 25 $total = scalar @$data;

374 22 100 27 my $show = $n < $total ? $n : $total;

375

376 22 100 42 if ($ucols) {

377 3 5 @cols = @$ucols;

378 } else {

379 # BUG FIX: scan at least one row when data exists, so the header

380 # still lists columns even when showing 0 rows (n => 0). PERF:

381 # collect unique keys once, then sort once -- not sort-per-row.

382 19 100 32 my $scan = $show > 0 ? $show : ($total > 0 ? 1 : 0);

100

383 19 22 my %seen;

384 19 37 for my $i (0 .. $scan - 1) {

385 29 28 my $row = $data->[$i];

386 29 100 47 next unless ref $row eq 'HASH';

387 28 74 $seen{$_} = 1 for keys %$row;

388 }

389 19 57 @cols = sort keys %seen;

390 }

391 my $lc = defined $label_col ? $label_col

392 22 100 36 : (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef;

32 100 44

393 22 100 53 if (defined $lc) {

394 3 3 @cols = grep { $_ ne $lc } @cols;

6 10

395 3 5 $lab_header = $lc;

396 }

397 22 30 for my $i (0 .. $show - 1) {

398 31 33 my $row = $data->[$i];

399 31 100 40 $row = {} unless ref $row eq 'HASH';

400 31 100 42 push @labels, defined $lc ? $row->{$lc} : ($i + 1);

401 31 34 push @raw, [ map { $row->{$_} } @cols ];

50 83

402 }

403 22 100 39 $lab_header = '' unless defined $lab_header;

404 } elsif ($rt eq 'HASH') {

405 13 23 my @keys = keys %$data;

406 13 12 my $sample;

407 13 100 19 for my $k (@keys) { $sample = $data->{$k}; last if defined $sample; }

13 14

13 21

408 13 15 my $vt = ref $sample;

409

410 13 100 28 if (!@keys) { # ---- empty ----

100

411 # BUG FIX: an empty hash used to die ("neither ARRAY nor HASH");

412 # treat it as an empty table, mirroring an empty AoH.

413 1 2 $kind = 'Hash';

414 1 1 $total = 0;

415 1 1 $lab_header = '';

416 } elsif ($vt eq 'ARRAY') { # ---- HoA ----

417 5 5 $kind = 'HoA';

418 5 100 14 my @allcols = $ucols ? @$ucols : sort @keys;

419 5 6 $total = 0;

420 5 6 for my $k (@keys) {

421 10 100 14 next unless ref $data->{$k} eq 'ARRAY';

422 9 8 my $l = scalar @{ $data->{$k} };

9 9

423 9 100 16 $total = $l if $l > $total;

424 }

425 5 50 7 my $show = $n < $total ? $n : $total;

426 my $lc = defined $label_col ? $label_col

427 5 50 11 : (grep { $_ eq 'row_name' } @allcols) ? 'row_name' : undef;

7 100 11

428 5 100 7 @cols = grep { !defined $lc || $_ ne $lc } @allcols;

9 41

429 5 100 11 $lab_header = defined $lc ? $lc : '';

430 5 12 for my $i (0 .. $show - 1) {

431 push @labels, defined $lc

432 13 50 23 ? (ref $data->{$lc} eq 'ARRAY' ? $data->{$lc}[$i] : undef)

100

433 : ($i + 1);

434 push @raw, [ map {

435 13 100 18 ref $data->{$_} eq 'ARRAY' ? $data->{$_}[$i] : undef

21 51

436 } @cols ];

437 }

438 } elsif ($vt eq 'HASH') { # ---- HoH ----

439 6 6 $kind = 'HoH';

440 6 7 $total = scalar @keys;

441 6 18 my @rk = sort @keys;

442 6 100 9 my $show = $n < $total ? $n : $total;

443 6 50 32 my @shown = $show > 0 ? @rk[0 .. $show - 1] : ();

444 6 100 9 if ($ucols) {

445 1 1 @cols = @$ucols;

446 } else {

447 # PERF: gather unique inner keys, sort once.

448 5 6 my %seen;

449 5 6 for my $rkk (@shown) {

450 9 100 14 next unless ref $data->{$rkk} eq 'HASH';

451 8 8 $seen{$_} = 1 for keys %{ $data->{$rkk} };

8 24

452 }

453 5 12 @cols = sort keys %seen;

454 }

455 6 100 11 @cols = grep { $_ ne $label_col } @cols if defined $label_col;

4 7

456 # BUG FIX: honour the row_names alias here too (was 'row.names'

457 # only, so row_names => 'x' showed a 'row_name' header).

458 6 100 9 $lab_header = defined $label_col ? $label_col : 'row_name';

459 6 6 for my $rkk (@shown) {

460 11 12 push @labels, $rkk;

461 11 100 18 my $inner = ref $data->{$rkk} eq 'HASH' ? $data->{$rkk} : {};

462 11 12 push @raw, [ map { $inner->{$_} } @cols ];

18 35

463 }

464 } else { # ---- flat hash ----

465 # BUG/FEATURE: a hash whose values are plain scalars

466 # ({ a => 1, b => 2, ... }) used to die ("neither ARRAY nor

467 # HASH"). Render it like write_table's flat hash: one row, keys

468 # as columns, a numeric '1' row label.

469 1 2 $kind = 'Hash';

470 1 1 $total = 1;

471 1 50 3 my $show = $n < $total ? $n : $total;

472 1 50 3 if ($ucols) {

473 0 0 @cols = @$ucols;

474 } else {

475 1 4 @cols = sort @keys;

476 }

477 my $lc = defined $label_col ? $label_col

478 1 50 3 : (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef;

3 50 5

479 1 50 3 if (defined $lc) {

480 0 0 @cols = grep { $_ ne $lc } @cols;

0 0

481 0 0 $lab_header = $lc;

482 }

483 1 50 2 $lab_header = '' unless defined $lab_header;

484 1 3 for my $i (0 .. $show - 1) {

485 1 50 2 push @labels, defined $lc ? $data->{$lc} : ($i + 1);

486 1 2 push @raw, [ map { $data->{$_} } @cols ];

3 5

487 }

488 }

489 }

490

491 # numeric detection per column (drives right/left alignment)

492 35 65 my @numeric = (1) x scalar @cols;

493 35 44 for my $r (@raw) {

494 56 79 for my $j (0 .. $#cols) {

495 92 90 my $v = $r->[$j];

496 92 100 126 next unless defined $v;

497 84 100 172 $numeric[$j] = 0 unless looks_like_number($v);

498 }

499 }

500 35 100 53 my $lab_numeric = @labels ? 1 : 0;

501 35 36 for my $l (@labels) {

502 48 100 66 134 $lab_numeric = 0, last unless defined $l && looks_like_number($l);

503 }

504

505 # stringify + sanitize + truncate cells (column names left intact)

506 35 36 my $ell_len = length $ell;

507 my $clean = sub {

508 148 148 161 my $v = shift;

509 148 100 197 return $na unless defined $v;

510 140 136 $v = "$v";

511 140 162 $v =~ s/\t/\\t/g; $v =~ s/\r/\\r/g; $v =~ s/\n/\\n/g;

140 149

140 147

512 140 100 66 256 if ($maxw && length($v) > $maxw) {

513 2 2 my $keep = $maxw - $ell_len;

514 2 50 4 $keep = 0 if $keep < 0;

515 2 4 $v = substr($v, 0, $keep) . $ell;

516 }

517 140 236 return $v;

518 35 178 };

519 35 47 my @lab_s = map { $clean->($_) } @labels;

56 96

520 35 43 my @rows_s = map { [ map { $clean->($_) } @$_ ] } @raw;

56 64

92 88

521 35 54 my @head_s = @cols;

522

523 # column widths

524 35 36 my $lab_w = length $lab_header;

525 35 100 44 for my $s (@lab_s) { my $l = length $s; $lab_w = $l if $l > $lab_w; }

56 54

56 91

526 35 35 my @w = map { length $_ } @head_s;

54 66

527 35 39 for my $r (@rows_s) {

528 56 66 for my $j (0 .. $#cols) {

529 92 103 my $l = length $r->[$j];

530 92 100 136 $w[$j] = $l if $l > $w[$j];

531 }

532 }

533

534 my $pad = sub {

535 237 237 295 my ($s, $width, $right) = @_;

536 237 50 243 my $g = $width - length $s; $g = 0 if $g < 0;

237 308

537 237 100 502 return $right ? (' ' x $g) . $s : $s . (' ' x $g);

538 35 93 };

539

540 35 37 my @out;

541 35 39 my $shown = scalar @rows_s;

542 35 100 167 push @out, sprintf("# %s: %d row%s x %d col%s (showing %d)",

100

543 $kind, $total, ($total == 1 ? '' : 's'),

544 scalar(@cols), (@cols == 1 ? '' : 's'), $shown);

545

546 35 60 my @hcells = ( $pad->($lab_header, $lab_w, 0) );

547 35 83 push @hcells, $pad->($head_s[$_], $w[$_], $numeric[$_]) for 0 .. $#cols;

548 35 76 push @out, join($gap, @hcells);

549

550 35 45 for my $ri (0 .. $#rows_s) {

551 56 66 my @cells = ( $pad->($lab_s[$ri], $lab_w, $lab_numeric) );

552 56 96 push @cells, $pad->($rows_s[$ri][$_], $w[$_], $numeric[$_]) for 0 .. $#cols;

553 56 133 push @out, join($gap, @cells);

554 }

555 35 50 59 push @out, sprintf("# ... %d more row%s", $total - $shown,

100

556 ($total - $shown == 1 ? '' : 's')) if $shown < $total;

557

558 35 65 my $str = join("\n", @out) . "\n";

559 35 50 57 unless ($quiet) { defined $fh ? print {$fh} $str : print $str; }

1 100 7

1 5

560 35 461 return $str;

561 }

562

563 1;

564 =encoding utf8

565

566 =head1 Synopsis

567

568 Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc.

569 I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.

570 There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.

571 This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.

572

573 There B other modules on CPAN that can do B of this, but this works the way that I B it to.

574

575 =head1 Functions/Subroutines

576

577 ========================================================================

578

579 =head2 add_data

580

581 Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference.

582

583 =head3 Hash of Hashes (HoH)

584

585 When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows.

586

587 $data = { 'Jack Smith' => { age => 30 } };

588

589 $n = {

590 'Jack Smith' => { # Update existing (Hash)

591 dept => 'Engineering'

592 },

593 'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash)

594 'Invalid' => 'Not a reference' # Edge case safety

595 };

596

597 add_data($data, $n);

598

599 B

600

601 {

602 "Jack Smith": {

603 "age": 30,

604 "dept": "Engineering"

605 },

606 "Jane Doe": {

607 "age": 25,

608 "dept": "Sales"

609 }

610 }

611

612 =head3 Hash of Arrays (HoA)

613

614 When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C.

615

616 $data = { 'Project Alpha' => [ 'task1', 'task2' ] };

617 $n = {

618 'Project Alpha' => [ 'task3' ], # Appends to existing array

619 'Project Beta' => [ 'task1', 'task2' ] # Creates new array row

620 };

621 add_data($data, $n);

622

623 B

624

625 {

626 "Project Alpha": [ "task1", "task2", "task3" ],

627 "Project Beta": [ "task1", "task2" ]

628 }

629

630 =head3 Array of Hashes / Arrays (AoH / AoA)

631

632 C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices.

633

634 $data = [

635 { id => 1, name => 'Alice' }

636 ];

637

638 $n = [

639 { role => 'Admin' }, # Updates index 0

640 { id => 2, name => 'Bob' } # Creates index 1

641 ];

642

643 add_data($data, $n);

644

645 B

646

647 [

648 { "id": 1, "name": "Alice", "role": "Admin" },

649 { "id": 2, "name": "Bob" }

650 ]

651

652 =head3 Advanced Structural Coercion & Cross-Merging

653

654 C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target.

655

656 B<1. Inner Coercion (Mixing Rows):>

657

658 =over

659

660 =item * B Source Array rows are read in pairs and converted to key-value pairs.

661

662 =item * B Source Hash rows are flattened into key-value pairs and pushed onto the array.

663

664 =back

665

666 B<2. Root-Level Coercion (Mixing Outer Containers):>

667

668 =over

669

670 =item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored.

671

672 =item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">).

673

674 =back

675

676 =head3 Source is a mixed Hash. Keys dictate the target array index!

677

678 $n = {

679 '0' => { y => 20 }, # Merges into $data->[0]

680 '1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1]

681 'ignored' => { k => 'v' } # Ignored: cannot map to an array index

682 };

683

684 add_data($data, $n);

685

686 B

687

688 [

689 { "x": 10, "y": 20 },

690 { "z": 30 }

691 ]

692

693 NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data.

694

695 =head2 aoh2hoa

696

697 C — transpose an B (row-major) into a B (column-major).

698

699 my $hoa = aoh2hoa([ { a => 1, b => 2 }, { a => 3 } ]);

700 # $hoa = { a => [1, 3], b => [2, undef] }

701

702 Rows go in, columns come out: each distinct key across the input rows becomes one output column, and the values are gathered down that column in row order.

703

704 =head3 Arguments

705

706 C<$aoh> — an array ref of hash refs, one hash per row. This is the only argument, and it is required. Passing anything that is not an array ref is fatal:

707

708 aoh2hoa({ a => 1 }); # dies: argument must be an arrayref of hashrefs

709

710 =head3 Returns

711

712 A hash ref of array refs. Each key is a column name (the union of all keys seen across the rows); each value is an array ref holding that column's cells. Every column has exactly C elements, so the result is rectangular even when the input is ragged.

713

714 =head3 Behavior

715

716 The column set is the B of every row's keys — a key that appears in only some rows still produces a full-length column, with C in the rows that lacked it.

717

718 Each column is padded to exactly the row count. Cells missing from a given row come through as C, including trailing gaps (a column whose last contributing row is early still runs the full length). These absent cells are cheap holes in the array, not stored SVs.

719

720 Values are B (C), so the returned structure is independent of the input — mutating C<$aoh> afterward won't disturb the result. The copy is shallow: a value that is itself a reference is copied the same way C<< $col-E[$i] = $row-E{$k} >> would, i.e. the ref is duplicated but its referent is shared.

721

722 Keys are handled SV-first (C / C), so UTF-8 and otherwise non-trivial hash keys round-trip correctly.

723

724 A row that is B a hash ref is skipped rather than fatal: it contributes C to every column at its index. So a stray C or scalar in the input thins the columns at that position instead of dying.

725

726 =head3 Notes

727

728 The output column order follows hash iteration order and is therefore not guaranteed — sort the keys if you need a stable layout. Round-tripping through C (or the reverse) reconstructs the data but not necessarily the original key/row ordering, and rows originally absent a key will gain it as an explicit C.

729

730 =head2 aov

731

732 Warning: assumes normal distribution

733

734 aov(

735 {

736 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

737 ctrl => [1, 1, 1, 0, 0, 0]

738 },

739 'yield ~ ctrl');

740

741 which returns

742

743 {

744 ctrl {

745 Df 1,

746 "F value" 25.6000000000001,

747 "Mean Sq" 1.70666666666667,

748 Pr(>F) 0.00718232855871859,

749 "Sum Sq" 1.70666666666667

750 },

751 Residuals {

752 Df 4,

753 "Mean Sq" 0.0666666666666665,

754 "Sum Sq" 0.266666666666666

755 }

756 }

757

758 You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:

759

760 my $res_2way = aov($data_2way, 'len ~ supp * dose');

761

762 It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.

763

764 =head3 Input Parameters

765

766

767

768 =begin html

769

770

771
772
773								Parameter
774								Type
775								Default
776								Description
777								Example
778
779
780
781
782								`data_sv`
783								`HashRef` or `ArrayRef`
784								(Required)
785								The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's `stack()` on a named list).
786
787
788
789								`formula_sv`
790								`String`
791								`undef`
792								A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to `'Value ~ Group'` and the input data is stacked.
793								`'yield ~ N * P'`
794
795
796

797

798 =end html

799

800

801

802 =head3 Output Variables

803

804 The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs.

805

806

807

808 =begin html

809

810

811
812
813								Parameter
814								Type
815								Default
816								Description
817								Example
818
819
820
821
822								(Term Name)
823								`HashRef`
824								`undef`
825								A nested hash for each independent term in the formula (e.g., `'Group'`, `'N:P'`), containing its ANOVA table statistics.
826								`{'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004}`
827
828
829								`Residuals`
830								`HashRef`
831								`undef`
832								A nested hash containing the residual (error) statistics for the fitted model.
833								`{'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55}`
834
835
836								`group_stats`
837								`HashRef`
838								`undef`
839								A nested hash containing descriptive statistics (`mean` and `size` / count) for every column evaluated in the original unstacked data structure.
840								`{'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}}`
841
842
843

844

845 =end html

846

847

848

849 =head3 omitting formula

850

851 As of version 0.07, in the case of an omitted formula, stacking is done:

852

853 aov(

854 {

855 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

856 ctrl => [1, 1, 1, 0, 0, 0]

857 },

858 );

859

860 is the equivalent of:

861

862 yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2)

863 ctrl <- c(1, 1, 1, 0, 0, 0)

864

865 # Combine them into a named list (the R equivalent of your hash)

866 my_list <- list(yield = yield, ctrl = ctrl)

867

868 # Convert the list into a "long" dataframe

869 # This creates two columns: "values" and "ind" (the group name)

870 my_data <- stack(my_list)

871

872 # Rename columns for clarity (optional but good practice)

873 colnames(my_data) <- c("Value", "Group")

874 anova_model <- aov(Value ~ Group, data = my_data)

875 summary(anova_model)

876

877 in R

878

879 =head2 cfilter

880

881 Select B out of a table and return it in the same shape. A column is

882 the inner (second-level) key of a B or an B,

883 or the outer key of a B:

884

885 use Stats::LikeR;

886 my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] );

887 cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] }

888 cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] }

889

890 C takes exactly one of C or C. C returns only the

891 matching columns; C returns everything except them. The result is the

892 same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values

893 copied and the original structure left untouched.

894

895 =head3 Selecting by name

896

897 Pass an array ref of column names. Naming a column that is not present in the

898 data is an error (it catches typos), and a row that happens not to contain a

899 kept column simply comes back without it:

900

901 my @aoh = ( { a => 1, b => 2 }, { a => 3 } );

902 cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ]

903

904 =head3 Selecting by a predicate

905

906 Instead of names, C/C accept a B — a CODE ref or a

907 function name — evaluated once per column. It is called as

908

909 $predicate->($column_values, $column_name)

910

911 where C<$column_values> is an array ref of the column's B cells (undef

912 and missing cells are dropped, so functions like C get clean input).

913 With C, columns for which the predicate is true are kept; with C,

914 those columns are dropped.

915

916 # Keep only the constant columns (standard deviation zero):

917 my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] }

918 # Drop the constant columns instead:

919 my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... }

920 # A bare function name resolves in Stats::LikeR:: (use a package for your own):

921 cfilter(\%hoa, keep => 'some_predicate');

922

923 A bare string is always treated as a B, not a single column

924 name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>.

925

926 =head3 Errors

927

928 C dies (via C) when:

929

930 =over

931

932 =item * neither C nor C is given, or both are,

933

934 =item * a named column is not present in the data,

935

936 =item * the selector is neither an array ref nor a code ref / function name, or the

937 function name cannot be resolved,

938

939 =item * an unknown option is given, or the options are not C<< name =E value >> pairs,

940

941 =item * the data is not a hash/array reference of the expected shape (a hash of hash

942 refs or array refs, or an array of hash refs).

943

944 =back

945

946 =head2 chisq_test

947

948 The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results.

949

950 For 2x2 matrices, Yates' Continuity Correction is applied automatically.

951

952 =head3 Accepted Inputs

953

954

955

956 =begin html

957

958

959
960
961								Input Type
962								Data Structure
963								Applied Test
964
965
966
967
968								1D Array
969								`[ $v1, $v2, ... ]`
970								Chi-squared test for given probabilities
971
972
973								2D Array
974								`[ [ $v1, $v2 ], [ $v3, $v4 ] ]`
975								Pearson's Chi-squared test (Yates' correction if 2x2)
976
977
978								1D Hash
979								`{ key1 => $v1, key2 => $v2 }`
980								Chi-squared test for given probabilities
981
982
983								2D Hash
984								`{ row1 => { c1 => $v1, c2 => $v2 } }`
985								Pearson's Chi-squared test (Yates' correction if 2x2)
986
987
988

989

990 =end html

991

992

993

994 =head3 Output Object Structure

995

996 The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input.

997

998

999

1000 =begin html

1001

1002

1003
1004
1005								Key
1006								Data Type
1007								Description
1008
1009
1010
1011
1012								data.name
1013								String
1014								Identifies the input type (e.g., `"Perl ArrayRef"` or `"Perl HashRef"`).
1015
1016
1017								expected
1018								Array/Hash Ref
1019								The expected frequencies, matching the geometry of the input.
1020
1021
1022								method
1023								String
1024								The specific statistical test applied.
1025
1026
1027								observed
1028								Array/Hash Ref
1029								The original data passed to the function.
1030
1031
1032								p.value
1033								Float
1034								The calculated p-value of the test.
1035
1036
1037								parameter
1038								Hash Ref
1039								Contains the degrees of freedom (`df`).
1040
1041
1042								statistic
1043								Hash Ref
1044								Contains the test statistic (`X-squared`).
1045
1046
1047

1048

1049 =end html

1050

1051

1052

1053 =head3 Two-Dimensional Array

1054

1055 Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically.

1056

1057 my $test_data = [

1058 [762, 327, 468],

1059 [484, 239, 477]

1060 ];

1061 my $res = chisq_test($test_data);

1062

1063 B

1064

1065 {

1066 'data.name' => 'Perl ArrayRef',

1067 'expected' => [

1068 [ 703.671381936888, 319.645266594124, 533.683351468988 ],

1069 [ 542.328618063112, 246.354733405876, 411.316648531012 ]

1070 ],

1071 'method' => "Pearson's Chi-squared test",

1072 'observed' => [

1073 [ 762, 327, 468 ],

1074 [ 484, 239, 477 ]

1075 ],

1076 'p.value' => 2.95358918321176e-07,

1077 'parameter' => { 'df' => 2 },

1078 'statistic' => { 'X-squared' => 30.0701490957547 }

1079 }

1080

1081 =head3 1-Dimensional Array (Goodness of Fit)

1082

1083 Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items.

1084

1085 my $data = [10, 20, 30];

1086 my $res = chisq_test($data);

1087

1088 B

1089

1090 {

1091 'data.name' => 'Perl ArrayRef',

1092 'expected' => [ 20, 20, 20 ],

1093 'method' => 'Chi-squared test for given probabilities',

1094 'observed' => [ 10, 20, 30 ],

1095 'p.value' => 0.00673794699908547,

1096 'parameter' => { 'df' => 2 },

1097 'statistic' => { 'X-squared' => 10 }

1098 }

1099

1100 =head3 2-Dimensional Hash (Pearson's Chi-squared)

1101

1102 Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings.

1103

1104 my $data = {

1105 GroupA => { Success => 10, Failure => 15 },

1106 GroupB => { Success => 20, Failure => 5 }

1107 };

1108

1109 my $res = chisq_test($data);

1110

1111 B

1112

1113 {

1114 'data.name' => 'Perl HashRef',

1115 'expected' => {

1116 'GroupA' => { 'Failure' => 10, 'Success' => 15 },

1117 'GroupB' => { 'Failure' => 10, 'Success' => 15 }

1118 },

1119 'method' => "Pearson's Chi-squared test with Yates' continuity correction",

1120 'observed' => {

1121 'GroupA' => { 'Failure' => 15, 'Success' => 10 },

1122 'GroupB' => { 'Failure' => 5, 'Success' => 20 }

1123 },

1124 'p.value' => 0.00937475878430379,

1125 'parameter' => { 'df' => 1 },

1126 'statistic' => { 'X-squared' => 6.75 }

1127 }

1128

1129 =head3 One-Dimensional Hash (Goodness of Fit)

1130

1131 Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks.

1132

1133 my $data = {

1134 Apples => 10,

1135 Oranges => 20,

1136 Bananas => 30

1137 };

1138

1139 my $res = chisq_test($data);

1140

1141 =head2 C

1142

1143 Apply a B to every pair of columns in a table and collect

1144 the answers in a hash of hashes.

1145

1146 It's the workhorse behind things like correlation matrices: give it your data and

1147 the name of a function that takes two columns (C, C, …) and you get

1148 back every column compared against every other column.

1149

1150 use Stats::LikeR;

1151

1152 my %data = (

1153 height => [ 170, 165, 180, 175 ],

1154 weight => [ 70, 60, 85, 77 ],

1155 age => [ 30, 41, 25, 38 ],

1156 );

1157

1158 my $result = col2col(\%data, 'cor');

1159

1160 # $result->{height}{weight} == correlation of height vs weight

1161 # $result->{height}{age} == correlation of height vs age

1162 # ...and so on for every pair

1163

1164 ========================================================================

1165

1166 =head3 Arguments

1167

1168 col2col( $data, $command, $cols, %options )

1169 col2col( $data, $command, \%options ) # options in place of $cols

1170

1171

1172

1173 =begin html

1174

1175

1176
1177
1178								Position
1179								Argument
1180								What it is
1181
1182
1183
1184
1185								1
1186								`$data`
1187								Your table, as a reference (see Data shapes below).
1188
1189
1190								2
1191								`$command`
1192								A code block or the name of a two-column function.
1193
1194
1195								3
1196								`$cols`
1197								(optional) Which columns to use as the "from" side. Omit for all.
1198
1199
1200								4+
1201								`%options`
1202								(optional) `na`, `skip.errors`, … (see Options).
1203
1204
1205

1206

1207 =end html

1208

1209

1210

1211 ========================================================================

1212

1213 =head3 Data shapes

1214

1215 C understands three layouts. In every case a B is the thing that

1216 gets compared, and the result is keyed by column name.

1217

1218 B — keys are column names:

1219

1220 my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] );

1221

1222 B — First keys are row names, second keys are columns:

1223

1224 my %hoh = (

1225 row1 => { a => 1, b => 4 },

1226 row2 => { a => 2, b => 5 },

1227 );

1228

1229 B — each element is a row, inner keys are columns:

1230

1231 my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } );

1232

1233 All three produce the same result for the same underlying numbers. Missing or

1234 C cells are handled by the C option (below).

1235

1236 ========================================================================

1237

1238 =head3 The command

1239

1240 The second argument is the function applied to each pair of columns. It is called

1241 as:

1242

1243 $command->( $column_a, $column_b ) # two ARRAY refs

1244

1245 so inside a block the two columns arrive in C<@_>:

1246

1247 my $result = col2col(\%data, sub {

1248 my ($x, $y) = @_; # $x and $y are array refs

1249 cor($x, $y);

1250 });

1251

1252 You can also pass a B. A bare name is looked up in

1253 C, so these two are equivalent:

1254

1255 col2col(\%data, 'cor');

1256 col2col(\%data, sub { cor($_[0], $_[1]) });

1257

1258 ========================================================================

1259

1260 =head3 The result

1261

1262 Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>.

1263

1264 for my $from (sort keys %$result) {

1265 for my $to (sort keys %{ $result->{$from} }) {

1266 printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to};

1267 }

1268 }

1269

1270 A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist.

1271

1272 ========================================================================

1273

1274 =head3 Restricting columns (C<$cols>)

1275

1276 By default every column is used as the "from" side. The third argument narrows

1277 that down — handy when you only care about one variable.

1278

1279 # all columns vs all columns

1280 my $all = col2col(\%data, 'cor');

1281 # just ONE column vs every other column

1282 my $one = col2col(\%data, 'cor', 'height');

1283 my $cors = $one->{height}; # { weight => ..., age => ... }

1284 # a FEW specific columns vs every other column

1285 my $few = col2col(\%data, 'cor', ['height', 'weight']);

1286

1287 The "to" side is always every other column; C<$cols> only limits the outer keys.

1288

1289 ========================================================================

1290

1291 =head3 Options

1292

1293 Options can be given two ways:

1294

1295 col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols

1296 col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed

1297

1298 The hash-ref form is convenient when you have B column restriction — it saves

1299 you from passing a placeholder. (A hash ref I C<$cols>, so you can't use

1300 it to restrict columns at the same time; use the trailing form for that.)

1301

1302 =head4 C — how undefined values are handled

1303

1304 Real data has gaps. C decides what the function sees.

1305

1306

1307

1308 =begin html

1309

1310

1311
1312
1313								Value
1314								Behaviour
1315								Use for
1316
1317
1318
1319
1320								`'pairwise'` (default)
1321								A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length.
1322								Paired stats like `cor`.
1323
1324
1325								`'omit'`
1326								Each column drops its own undefined values independently. The two columns may end up different lengths.
1327								Unpaired tests like `t_test`, `kruskal_test`, where a gap in one sample shouldn't discard a value in the other.
1328
1329
1330								`'keep'`
1331								Every row is passed through, `undef` and all.
1332								When your function does its own missing-data handling.
1333
1334
1335

1336

1337 =end html

1338

1339

1340

1341 # correlation: keep only complete pairs (the default)

1342 col2col(\%data, 'cor');

1343 # two-sample test: each column keeps its own values

1344 col2col(\%data, 't_test', undef, na => 'omit');

1345 col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder

1346

1347 C / C remain as boolean aliases for backward compatibility:

1348 C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C.

1349

1350 =head4 C — keep going when a pair fails I<(default: true)>

1351

1352 Some functions croak on degenerate input — for example C dies if a column has

1353 zero variance. By default C B that croak per pair: instead of

1354 aborting the whole run, it stores the B of the error message in that

1355 cell, so the result tells you I pair failed and I. Every other cell is

1356 computed normally.

1357

1358 my $r = col2col(\%data, 'cor');

1359 # a good pair: $r->{a}{b} == 0.83

1360 # a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0'

1361

1362 To restore the old "die on the first error" behaviour, turn it off:

1363

1364 col2col(\%data, 'cor', undef, 'skip.errors' => 0);

1365 col2col(\%data, 'cor', { 'skip.errors' => 0 });

1366

1367 Only errors from B are trapped. Mistakes in the call itself

1368 (unknown column, bad data, unknown function name, unknown option) always die.

1369

1370 ========================================================================

1371

1372 =head3 Worked examples

1373

1374 B

1375

1376 my $m = col2col(\%data, 'cor');

1377

1378 B

1379

1380 my $col = 'Testosterone, total (nmol/L)';

1381 my $cors = col2col($hoa, 'cor', $col)->{$col};

1382 for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) {

1383 next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message

1384 printf "%-30s % .3f\n", $other, $cors->{$other};

1385 }

1386

1387 B

1388

1389 my $t = col2col($hoa, 't_test', undef, na => 'omit');

1390

1391 B

1392

1393 my $m = col2col($hoa, 'cor');

1394 for my $from (sort keys %$m) {

1395 for my $to (sort keys %{ $m->{$from} }) {

1396 my $v = $m->{$from}{$to};

1397 warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error

1398 }

1399 }

1400

1401 ========================================================================

1402

1403 =head3 Gotchas

1404

1405 =over

1406

1407 =item * B, C<($col_a, $col_b)> — not a column and

1408 a name. Unpack with C.

1409

1410 =item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall

1411 variance can be flat on just the rows it shares with one partner, so C may

1412 still croak for that pair. With the default C, that shows up as a

1413 message in the single offending cell rather than killing the run.

1414

1415 =item * B<< C does not modify your data. >> It reads the table and returns a new

1416 hash of hashes.

1417

1418 =item * B — i.e.

1419 C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0>

1420 means column C is the degenerate one for that pair.

1421

1422 =back

1423

1424 =head2 cor

1425

1426 cor($array1, $array2, $method = 'pearson'),

1427

1428 that is, C is the default and will be used if C<$method> is not specified.

1429

1430 Just like R, C, C, and C are available

1431

1432 If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically.

1433

1434 =head2 cor_test

1435

1436 my $result = cor_test(

1437 'x' => $x,

1438 'y' => $y,

1439 alternative => 'two.sided',

1440 method => 'pearson',

1441 continuity => 1

1442 );

1443

1444 C safely handles C (or C) values seamlessly by computing over pairwise complete observations.

1445

1446 =head2 cov

1447

1448 cov($array1, $array2, 'pearson')

1449

1450 or

1451

1452 cov($array1, $array2, 'spearman')

1453

1454 or

1455

1456 cov($array1, $array2, 'kendall')

1457

1458 =head2 csort

1459

1460 Sort a table by a column or by a custom comparator. Works on both common Perl table shapes and can transpose between them on the way out. Stable, non-destructive.

1461

1462 =head3 Signature

1463

1464 my $sorted = csort($data, $by);

1465 my $sorted = csort($data, $by, $output);

1466

1467 =over

1468

1469 =item * B<< C<$data> >> — your table, in either shape:

1470 =over

1471

1472 =item * B — arrayref of hashrefs (a list of rows): C<< [ {id=E1, v=E10}, {id=E2, v=E20} ] >>

1473

1474 =item * B — hashref of arrayrefs (parallel columns): C<< { id=E[1,2], v=E[10,20] } >>

1475

1476 =back

1477

1478 =item * B<< C<$by> >> — I to sort:

1479 =over

1480

1481 =item * a B (string), or

1482

1483 =item * a B coderef using C<$a> / C<$b>, just like Perl's C

1484

1485 =back

1486

1487 =item * B<< C<$output> >> I<(optional)> — C<'aoh'> or C<'hoa'> (case-insensitive). Defaults to the input shape; C also means "same as input".

1488

1489 =back

1490

1491 Returns a B structure. The input is never modified.

1492

1493 =head3 What it does

1494

1495 =over

1496

1497 =item * B — numeric if every defined value in that column looks like a number, otherwise string comparison. Missing / C values sort B (matching R's C).

1498

1499 =item * B — C<$a> and C<$b> are set in the comparator's I package, so a named sub from another package still sees its own C<$a>/C<$b>. For AoH they're the row hashrefs; for HoA they're per-row hashref views synthesized from the columns.

1500

1501 =item * B — equal keys keep their original order (merge sort, same as Perl C and R C).

1502

1503 =item * B — keep the input shape, or transpose: AoH→HoA builds the union of all row keys (ordered by first appearance, gaps filled with C); HoA→AoH emits one hashref per row.

1504

1505 =back

1506

1507 =head3 Examples

1508

1509 # by column, ascending numeric, AoH in / AoH out

1510 my $rows = csort($aoh, 'score');

1511

1512 # custom comparator (descending), HoA in / HoA out

1513 my $cols = csort($hoa, sub { $b->{score} <=> $a->{score} });

1514

1515 # sort an AoH but hand it back as columns (HoA)

1516 my $cols = csort($aoh, 'name', 'hoa');

1517

1518 =head3 Notes

1519

1520 =over

1521

1522 =item * B AoH output reuses the original row hashrefs (re-ordered); HoA output permutes every column in lockstep.

1523

1524 =item * Empty and single-row tables are handled for all four in/out combinations.

1525

1526 =item * An invalid C<$output> value croaks.

1527

1528 =back

1529

1530 =head2 dnorm

1531

1532 gives the density of the normal distribution, with the specified mean and standard deviation.

1533

1534 In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value.

1535

1536 returns a single scalar/number if a single value is given, otherwise returns an array reference.

1537

1538 Usage:

1539

1540 dnorm(4) # assumes a mean of 0 and standard deviation of 1

1541

1542 but default mean, standard deviation, and log can be passed as parameters:

1543

1544 $x = dnorm(0, mean => 0, sd => 2, 'log' => 0);

1545

1546 =head2 filter

1547

1548 Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified.

1549

1550 my $df2 = filter($df, col('column.name') > 4);

1551

1552 C accepts a predicate in one of two forms:

1553

1554 =over

1555

1556 =item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and

1557

1558 =item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>.

1559

1560 =back

1561

1562 Both C and C

1563

1564 =head3 Arguments

1565

1566

1567

1568 =begin html

1569

1570

1571
1572
1573								Position
1574								Name
1575								Description
1576
1577
1578
1579
1580								1
1581								`$df`
1582								The data frame to filter. Either an array of hashes (AoH — e.g. the default output of `read_table`) or a hash of arrays (HoA).
1583
1584
1585								2
1586								predicate
1587								Either a `col()` comparison object or a `CODE` reference.
1588
1589
1590

1591

1592 =end html

1593

1594

1595

1596 The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned.

1597

1598 =head3 The C form

1599

1600 C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row.

1601

1602 filter($df, col('age') >= 18); # keep rows where age >= 18

1603 filter($df, col('sex') eq 'f'); # keep rows where sex is 'f'

1604 filter($df, 18 <= col('age')); # operands may be in either order

1605

1606 =head3 Comparison operators

1607

1608

1609

1610 =begin html

1611

1612

1613
1614
1615								Kind
1616								Operators
1617								Comparison
1618
1619
1620
1621
1622								Numeric
1623								`>` `<` `>=` `<=` `==` `!=`
1624								numeric (the cell and the value are compared as numbers)
1625
1626
1627								String
1628								`gt` `lt` `ge` `le` `eq` `ne`
1629								string (the cell and the value are compared as strings)
1630
1631
1632

1633

1634 =end html

1635

1636

1637

1638 C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>.

1639

1640 =head3 Combining predicates: C<&>, C<|>, C

1641

1642 Predicates compose with bitwise C<&> (and), C<|> (or), and C (not):

1643

1644 filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and

1645 filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or

1646 filter($df, !(col('x') > 100)); # not

1647

1648 Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability.

1649

1650 =head3 The code-reference form

1651

1652 For logic the operators can't express, pass a C_{. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row.}

1653

1654 filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' });

1655 filter($df, sub { $_->{name} =~ /^A/ });

1656 filter($df, sub { $_[0]{score} > $_[0]{threshold} });

1657

1658 For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape.

1659

1660 =head3 Examples

1661

1662 use Stats::LikeR;

1663 my $df = read_table('patients.csv'); # array of hashes

1664 # numeric threshold

1665 my $adults = filter($df, col('Age') >= 18);

1666 # combine conditions

1667 my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f'));

1668 # arbitrary logic with a coderef

1669 my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 });

1670 # hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel

1671 my $hoa = read_table('patients.csv', 'output.type' => 'hoa');

1672 my $sub = filter($hoa, col('Age') > 32);

1673 # $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned

1674

1675 =head3 Behavior and notes

1676

1677 =over

1678

1679 =item * B C builds and returns a new frame; C<$df> is left untouched.

1680

1681 =item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C.

1682

1683 =item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays.

1684

1685 =item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA).

1686

1687 =item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error.

1688

1689 =item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS.

1690

1691 =back

1692

1693 =head3 See also

1694

1695 C (whose C option applies the same coderef convention while reading a file), C.

1696

1697 =head2 fisher_test

1698

1699 =head3 array reference entry

1700

1701 my $array_data = [

1702 [10, 2],

1703 [3, 15]

1704 ];

1705 my $res1 = fisher_test($array_data);

1706

1707 which returns a hash reference:

1708

1709 {

1710 alternative "two.sided",

1711 conf_int [

1712 [0] 2.75343836564204,

1713 [1] 300.682787419401

1714 ],

1715 conf_level 0.95,

1716 estimate {

1717 "odds ratio" 21.3053312750168

1718 },

1719 method "Fisher's Exact Test for Count Data",

1720 p_value 0.000536724119143435

1721 }

1722

1723 =head3 hash reference entry

1724

1725 $ft = fisher_test( {

1726 Guess => {

1727 Milk => 3, Tea => 1

1728 },

1729 Truth => {

1730 Milk => 1, Tea => 3

1731 }

1732 });

1733

1734 =head2 glm

1735

1736 takes a hash of an array as input

1737

1738 my %tooth_growth = (

1739 dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1740 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5

1741 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0

1742 2.0 2.0 2.0)],

1743 len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5

1744 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5

1745 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0

1746 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],

1747 supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC

1748 VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ

1749 OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]

1750 );

1751

1752 my $glm_teeth = glm(

1753 data => \%tooth_growth,

1754 formula => 'len ~ dose + supp',

1755 family => 'gaussian'

1756 );

1757

1758 In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS):

1759

1760 my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');

1761

1762 =head3 Input Parameters

1763

1764

1765

1766 =begin html

1767

1768

1769
1770
1771								Parameter
1772								Type
1773								Default
1774								Description
1775								Example
1776
1777
1778
1779
1780								`formula`
1781								`String`
1782								None (Required)
1783								A symbolic description of the model to be fitted. Supports operators like `+`, `:`, `*`, `^`, and `-1` (to remove the intercept).
1784								`'am ~ wt + hp'`, `'y ~ x - 1'`
1785
1786
1787								`data`
1788								`HashRef` or `ArrayRef`
1789								None (Required)
1790								The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH).
1791								`\%mtcars`, `[{x => 1, y => 2}, ...]`
1792
1793
1794								`family`
1795								`String`
1796								`'gaussian'`
1797								A description of the error distribution and link function to be used in the model. Currently supports `'gaussian'` (identity link) and `'binomial'` (logit link).
1798								`'binomial'`
1799
1800
1801

1802

1803 =end html

1804

1805

1806

1807 =head3 Output variables

1808

1809

1810

1811 =begin html

1812

1813

1814
1815
1816								Variable
1817								Type
1818								Description
1819								Example
1820
1821
1822
1823
1824								`aic`
1825								`Double`
1826								Akaike's Information Criterion for the fitted model.
1827								`123.45`
1828
1829
1830								`boundary`
1831								`Integer (Boolean)`
1832								`1` if the fitted values computationally reached the `0` or `1` boundary (specific to the binomial family), `0` otherwise.
1833								`0`
1834
1835
1836								`coefficients`
1837								`HashRef`
1838								A hash mapping the expanded model term names to their estimated coefficient values.
1839								`{'Intercept' => 1.5, 'wt' => -0.5}`
1840
1841
1842								`converged`
1843								`Integer (Boolean)`
1844								`1` if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, `0` otherwise.
1845								`1`
1846
1847
1848								`deviance`
1849								`Double`
1850								The residual deviance of the fitted model.
1851								`15.2`
1852
1853
1854								`deviance.resid`
1855								`HashRef`
1856								A hash mapping data row names to their computed deviance residuals.
1857								`{'Mazda RX4' => 0.12}`
1858
1859
1860								`df.null`
1861								`Integer`
1862								The residual degrees of freedom for the null model.
1863								`31`
1864
1865
1866								`df.residual`
1867								`Integer`
1868								The residual degrees of freedom for the fitted model.
1869								`30`
1870
1871
1872								`family`
1873								`String`
1874								The statistical family used to fit the model.
1875								`"gaussian"`
1876
1877
1878								`fitted.values`
1879								`HashRef`
1880								A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response).
1881								`{'Mazda RX4' => 0.85}`
1882
1883
1884								`iter`
1885								`Integer`
1886								The number of IRLS iterations performed before convergence or hitting the iteration limit.
1887								`4`
1888
1889
1890								`null.deviance`
1891								`Double`
1892								The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed).
1893								`43.5`
1894
1895
1896								`rank`
1897								`Integer`
1898								The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters).
1899								`2`
1900
1901
1902								`summary`
1903								`HashRef`
1904								A nested hash mapping each term to its detailed summary statistics, including `Estimate`, `Std. Error`, `t value` / `z value`, and `Pr(> t )` / `Pr(> z )`. Aliased parameters return `"NaN"`.
1905								`{'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}}`
1906
1907
1908								`terms`
1909								`ArrayRef`
1910								An ordered list of the expanded term names included in the model matrix.
1911								`['Intercept', 'wt', 'hp']`
1912
1913
1914

1915

1916 =end html

1917

1918

1919

1920 =head2 group_by

1921

1922 Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column.

1923

1924 my $aoh_data = [

1925 { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },

1926 { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },

1927 { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },

1928 { 'Gender' => 'Female' } # Intentional missing target value

1929 ];

1930

1931 as well as

1932

1933 $hoh_data = {

1934 'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },

1935 'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },

1936 'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },

1937 'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value

1938 'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef

1939 };

1940

1941 and

1942

1943 my $hoa_data = {

1944 'Gender' => ['Male', 'Female', 'Male', 'Female'],

1945 'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ]

1946 };

1947

1948 then run the function thus:

1949

1950 group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender');

1951

1952 The output can be thought of like a hash, with the first string broken down by the second.

1953

1954 all become hash of arrays:

1955

1956 {

1957 Female [

1958 [0] 1.8

1959 ],

1960 Male [

1961 [0] 18.2,

1962 [1] 20.5

1963 ]

1964 }

1965

1966 returns an empty array of hashes if neither target nor group keys are found.

1967

1968 =head3 Filtering

1969

1970 Data can be further broken down with filter/subs like in C:

1971

1972 my $testosterone = group_by($d, # group testosterone by "Gender"

1973 'Testosterone, total (nmol/L)',

1974 'Gender',

1975 { 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter

1976 { 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter

1977 );

1978

1979 where each filter filters on the columns, e.g. second hash keys.

1980

1981 =head2 hoh2hoa

1982

1983 Convert a B (row-major: outer key = row, inner key = column)

1984 into a B (column-major: key = column, value = that column's

1985 cells down the rows).

1986

1987 use Stats::LikeR;

1988

1989 my %hoh = (

1990 'r1' => { 'a' => 1, 'b' => 2 },

1991 'r2' => { 'a' => 3, 'b' => 4 },

1992 );

1993

1994 my $hoa = hoh2hoa(\%hoh);

1995

1996 which returns

1997 {

1998 a => [1, 3],

1999 b => [2, 4],

2000 }

2001

2002 =head3 Behavior

2003

2004 =over

2005

2006 =item * B are the union of every inner key, so a key that appears in only

2007 some rows still becomes a column.

2008

2009 =item * B are emitted in sorted outer-key (row-name) order, and that one order

2010 is used for every column, so the arrays stay aligned and the result is

2011 reproducible regardless of hash ordering.

2012

2013 =item * B — a missing inner key, or a cell whose value is C — are filled

2014 with the fill value (see C below). Every column therefore has

2015 exactly one entry per row.

2016

2017 =item * Values are B into the result; the original structure is left

2018 untouched.

2019

2020 =item * An B hash of hashes returns an empty hash of arrays (it is not an

2021 error).

2022

2023 =back

2024

2025 =head3 Options

2026

2027 Options are passed as trailing C<< name =E value >> pairs.

2028

2029

2030

2031 =begin html

2032

2033

2034
2035
2036								Option
2037								Default
2038								Meaning
2039
2040
2041
2042
2043								`undef.val`
2044								`undef`
2045								Value used to fill a missing key or an `undef` cell. Any defined scalar works, including `0` and `''`. Passing `undef` keeps the default.
2046
2047
2048								`row.names`
2049								(none)
2050								If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column.
2051
2052
2053

2054

2055 =end html

2056

2057

2058

2059 # Ragged input with an explicit fill string:

2060 my %ragged = (

2061 'r1' => { 'a' => 1, 'b' => 2 },

2062 'r2' => { 'a' => 3, 'c' => 9 },

2063 );

2064 my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA');

2065 # {

2066 # a => [1, 3 ],

2067 # b => [2, 'NA'],

2068 # c => ['NA', 9 ],

2069 # }

2070

2071 # Keep the row labels as a column:

2072 my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id');

2073 # {

2074 # id => ['r1', 'r2'],

2075 # a => [1, 3 ],

2076 # b => [2, undef],

2077 # c => [undef, 9 ],

2078 # }

2079

2080 =head3 Errors

2081

2082 C dies (via C) when:

2083

2084 =over

2085

2086 =item * the argument is not a hash reference,

2087

2088 =item * any value in the hash is not itself a hash reference,

2089

2090 =item * an unknown option is given, or the options are not C<< name =E value >> pairs,

2091

2092 =item * C is not a plain string, or it names an already-present column.

2093

2094 =back

2095

2096 =head2 hist

2097

2098 Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.

2099

2100 my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);

2101

2102 If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.

2103

2104 =head2 kruskal_test

2105

2106 Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)

2107

2108 Performs a Kruskal-Wallis rank sum test, see

2109 https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test

2110

2111 =head3 hash of array entry

2112

2113 I feel that this is better, and more easily read, than what you get in R:

2114

2115 my %x = (

2116 'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],

2117 'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],

2118 'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]

2119 );

2120 $kt = kruskal_test(\%x);

2121

2122 =head3 R-like array entry

2123

2124 my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects

2125 my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease

2126 my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis

2127 my @x = (@xk, @yk, @zk);

2128 my @g = (

2129 (map {'Normal subjects'} 0..4),

2130 (map {'Subjects with obstructive airway disease'} 0..3),

2131 map {'Subjects with asbestosis'} 0..4

2132 );

2133 my $kt = kruskal_test(\@x, \@g);

2134

2135 =head2 ks_test

2136

2137 The Kolmogorov–Smirnov test checks whether two samples are drawn from the

2138 same distribution (two-sample), or whether a single sample is drawn from a

2139 given reference distribution (one-sample). It works by comparing the empirical

2140 cumulative distribution functions (ECDFs) and measuring the largest gap

2141 between them.

2142

2143 Two-sample form — pass two array references:

2144

2145 $ks = ks_test(\@x, \@y);

2146 $ks = ks_test(\@x, \@y, alternative => 'greater');

2147

2148 One-sample form — pass one array reference and the name of a reference CDF.

2149 Currently only C<'pnorm'> is supported, i.e. the standard normal distribution

2150 (mean 0, standard deviation 1):

2151

2152 $ks = ks_test(\@x, 'pnorm');

2153

2154 Arguments may be given positionally (as above) or by name:

2155

2156 $ks = ks_test(x => \@x, y => \@y, alternative => 'less', exact => 1);

2157

2158 Non-numeric and undefined elements are silently dropped before the test runs.

2159

2160 C selects which gap between the ECDFs is measured:

2161

2162 =over

2163

2164 =item * C<'two.sided'> (default) — the largest gap in either direction,

2165 D = sup |F_x − F_y|.

2166

2167 =item * C<'greater'> — the largest gap where x's ECDF rises above the other,

2168 D⁺ = sup (F_x − F_y).

2169

2170 =item * C<'less'> — the largest gap in the other direction, D⁻ = sup (F_y − F_x).

2171

2172 =back

2173

2174 These follow R's C convention: C<'greater'>/C<'less'> describe which CDF

2175 lies I the other, which (because a higher CDF means smaller values) is

2176 the opposite of which sample tends to be larger.

2177

2178 C controls how the p-value is computed. Omit it to let the test choose:

2179 the exact distribution is used for small samples (two-sample when nx·ny

2180 10000, one-sample when n < 100) and the asymptotic (Kolmogorov limiting)

2181 approximation otherwise. Pass C<< exact =E 1 >> to force the exact computation or

2182 C<< exact =E 0 >> to force the asymptotic one. Exact p-values cannot be computed

2183 when the data contain ties; if ties are present on the exact path, the test

2184 warns and falls back to the asymptotic p-value. (The exact one-sample test is

2185 only available for the two-sided alternative; a one-sided one-sample request

2186 also falls back to asymptotic.)

2187

2188 =head3 Return value

2189

2190 C returns a hash reference with four keys:

2191

2192 =over

2193

2194 =item * B<< C >> — the KS statistic for the chosen C: D, D⁺, or

2195 D⁻. It is the maximum distance between the two ECDFs (or, for the one-sample

2196 test, between the ECDF and the reference CDF), always in the range [0, 1].

2197 Larger values mean the distributions are further apart.

2198

2199 =item * B<< C >> — the probability, under the null hypothesis that the samples

2200 share a distribution, of observing a statistic at least this large. It is

2201 clamped to [0, 1]; a small value (e.g. < 0.05) is evidence against the null.

2202

2203 =item * B<< C >> — a human-readable description of exactly what was run, handy

2204 for logging or reproducing a result. One of:

2205 C<"Two-sample Kolmogorov-Smirnov exact test">,

2206 C<"Two-sample Kolmogorov-Smirnov test (asymptotic)">,

2207 C<"One-sample Kolmogorov-Smirnov exact test">, or

2208 C<"One-sample Kolmogorov-Smirnov test (asymptotic)">.

2209

2210 =item * B<< C >> — the alternative hypothesis that was applied

2211 (C<'two.sided'>, C<'greater'>, or C<'less'>), echoed back so the result is

2212 self-describing.

2213

2214 =back

2215

2216 For example:

2217

2218 my $ks = ks_test(\@x, \@y);

2219 if ($ks->{p_value} < 0.05) {

2220 printf "reject H0: D=%.4f, p=%.4g (%s)\n",

2221 $ks->{statistic}, $ks->{p_value}, $ks->{method};

2222 }

2223

2224 =head2 ljoin

2225

2226 Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>.

2227 C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>

2228

2229 For example,

2230

2231 {

2232 "Jack Smith" {

2233 age 30

2234 }

2235 }

2236

2237 and a second hash,

2238 {

2239 "Jack Smith" {

2240 dept "Engineering"

2241 },

2242 "Jane Doe" {

2243 age 25

2244 }

2245 }

2246

2247 in this case, running C will modify \%h to result:

2248

2249 {

2250 "Jack Smith" {

2251 age 30,

2252 dept "Engineering"

2253 }

2254 }

2255

2256 =head2 lm

2257

2258 This is the linear models function.

2259

2260 $lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);

2261

2262 where C<$mtcars> is a hash of hashes

2263

2264 C also supports generating interaction terms directly within the formula using the C<*> operator:

2265

2266 my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);

2267

2268 If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting.

2269

2270 the dot operator also works:

2271

2272 $lm = lm(formula => 'y ~ .', data => $dot_data);

2273

2274 =head2 matrix

2275

2276 my $mat1 = matrix(

2277 data => [1..6],

2278 nrow => 2

2279 );

2280

2281 You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise.

2282

2283 As of version 0.10, parameters do not need to be named, so that C works more like R:

2284

2285 my $d = matrix(rnorm(32000), 1000, 32);

2286

2287 works as C, C, and C

2288

2289 =head2 max

2290

2291 max(1,2,3);

2292

2293 or

2294

2295 my @arr = 1..8;

2296 max(@arr, 4, 5)

2297

2298 as of version 0.02, max will die if any undefined values are provided

2299

2300 =head2 mean

2301

2302 mean(1,2,3);

2303

2304 or

2305

2306 my @arr = 1..8;

2307 mean(@arr, 4, 5)

2308

2309 or

2310

2311 mean([1,1], [2,2]) # 1.5

2312

2313 as of version 0.02, mean will die if any undefined values are provided

2314

2315 =head2 median

2316

2317 works like mean, taking array references and arrays:

2318

2319 median( $test_data[$i][0] )

2320

2321 as of version 0.02, median will die if any undefined values are provided

2322

2323 =head2 min

2324

2325 min(1,2,3);

2326

2327 or

2328

2329 my @arr = 1..8;

2330 min(@arr, 4, 5)

2331

2332 as of version 0.02, min will die if any undefined values are provided

2333

2334 =head2 mode

2335

2336 Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings)

2337

2338 @arr = mode([1,3,3,3]); # returns (3)

2339

2340 @arr = mode('a','a','c','c','z'); # returns ('a', 'c')

2341

2342 =head2 oneway_test

2343

2344 A one-way test for equality of group means that, unlike C/ANOVA, B

2345 assume equal variances>. By default it performs B (the

2346 same default as R's C), so the residual degrees of freedom are

2347 usually fractional. Pass C<< var_equal =E 1 >> for the classic equal-variance form.

2348

2349 use Stats::LikeR qw(oneway_test);

2350

2351 =head3 Input

2352

2353 C accepts your data in one of three shapes. In every case each

2354 I is a vector of at least two numeric observations.

2355

2356

2357

2358 =begin html

2359

2360

2361
2362
2363								Shape
2364								What it means
2365								Group labels
2366
2367
2368
2369
2370								Hash of arrays `{ a => [...], b => [...] }`
2371								Each key is a group (R's `stack()` view of a named list)
2372								the hash keys
2373
2374
2375								Array of arrays `[ [...], [...] ]`
2376								Each element is a group
2377								`"Index 0"`, `"Index 1"`, …
2378
2379
2380								Hash + `formula` `{ resp => [...], grp => [...] }, formula => 'resp ~ grp'`
2381								Long-format columns split by a factor column
2382								the distinct values of the factor
2383
2384
2385

2386

2387 =end html

2388

2389

2390

2391 =head3 Options

2392

2393

2394

2395 =begin html

2396

2397

2398
2399
2400								Option
2401								Default
2402								Meaning
2403
2404
2405
2406
2407								`var_equal` (alias `var.equal`)
2408								`0` (false)
2409								`0` → Welch's test (unequal variances). `1` → pooled-variance test.
2410
2411
2412								`formula`
2413								none
2414								`'response ~ factor'`. Only valid with a hash input; an error with an array of arrays.
2415
2416
2417

2418

2419 =end html

2420

2421

2422

2423 =head3 Data validation

2424

2425 Every observation must be B; an C or non-numeric

2426 cell makes the call C with the offending group and position. This matches

2427 the rest of C (C, C, C, … all die on C) and

2428 prevents missing values from being silently treated as C<0>.

2429

2430 Each group needs at least two observations, and you need at least two groups.

2431

2432 =head3 Output

2433

2434 A hash reference with three top-level keys:

2435

2436

2437

2438 =begin html

2439

2440

2441
2442
2443								Key
2444								Value
2445
2446
2447
2448
2449								factor name (`Group`, or the formula's factor, e.g. `supp`)
2450								the between-groups row: `Df`, `Sum Sq`, `Mean Sq`, `F value`, `Pr(>F)`
2451
2452
2453								`Residuals`
2454								the within-groups row: `Df`, `Sum Sq`, `Mean Sq` (`Df` is fractional under Welch)
2455
2456
2457								`group_stats`
2458								`{ mean => { group => mean, … }, size => { group => n, … } }`
2459
2460
2461

2462

2463 =end html

2464

2465

2466

2467 =head3 Examples

2468

2469 =head4 Hash of arrays (each key is a group)

2470

2471 my $res = oneway_test({

2472 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

2473 ctrl => [1, 1, 1, 0, 0, 0 ],

2474 });

2475

2476 {

2477 Group => {

2478 Df => 1,

2479 "Sum Sq" => 61.6533333333333,

2480 "Mean Sq" => 61.6533333333333,

2481 "F value" => 177.504798464491,

2482 "Pr(>F)" => 1.31343255160843e-07,

2483 },

2484 Residuals => {

2485 Df => 9.81767348326473, # fractional: Welch correction

2486 "Sum Sq" => 3.47333333333333,

2487 "Mean Sq" => 0.353783749200256,

2488 },

2489 group_stats => {

2490 mean => { ctrl => 0.5, yield => 5.03333333333333 },

2491 size => { ctrl => 6, yield => 6 },

2492 },

2493 }

2494

2495 =head4 Array of arrays (groups named by index)

2496

2497 my $res = oneway_test([

2498 [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

2499 [1, 1, 1, 0, 0, 0 ],

2500 ]);

2501

2502 Identical to the hash form, except C is keyed by position:

2503

2504 group_stats => {

2505 mean => { "Index 0" => 5.03333333333333, "Index 1" => 0.5 },

2506 size => { "Index 0" => 6, "Index 1" => 6 },

2507 }

2508

2509 =head4 Long format with a formula

2510

2511 When your data is in columns rather than pre-split groups, name the response

2512 and factor columns with a formula. The factor's I become the groups and

2513 the factor's I becomes the top-level key:

2514

2515 my $res = oneway_test(

2516 {

2517 len => [4.2, 11.5, 7.3, 16.5, 17.3, 13.6, 23.6, 18.5, 33.9],

2518 supp => [qw(VC VC VC OJ OJ OJ HI HI HI)],

2519 },

2520 formula => 'len ~ supp',

2521 );

2522 # $res->{supp}, $res->{Residuals}, $res->{group_stats} ...

2523

2524 =head3 Classic equal-variance form

2525

2526 my $res = oneway_test(\%groups, var_equal => 1); # or 'var.equal' => 1

2527

2528 =head3 Notes

2529

2530 =over

2531

2532 =item * The default (Welch) does B require equal group sizes or equal variances;

2533 the pooled form (C<< var_equal =E 1 >>) assumes equal variances.

2534

2535 =item * C is only meaningful for a hash input. Passing it with an array of

2536 arrays is an error.

2537

2538 =item * Group order in the output is not guaranteed for hash inputs (it follows hash

2539 iteration order); read results by name, not position.

2540

2541 =item * Avoid naming a factor C or C in a formula, since those

2542 are reserved top-level keys in the result.

2543

2544 =back

2545

2546 =head2 p_adjust

2547

2548 Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"

2549

2550 my @q = p_adjust(\@pvalues, $method);

2551

2552 =head2 power_t_test

2553

2554 $test_data = power_t_test(

2555 n => 30, delta => 0.5,

2556 sd => 1.0, sig_level => 0.05

2557 );

2558

2559 It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution.

2560

2561

2562

2563 =begin html

2564

2565

2566
2567
2568								Parameter
2569								Type
2570								Default
2571								Description
2572
2573
2574
2575
2576								`n`
2577								Float
2578								`undef`
2579								Number of observations (per group for two-sample, pairs for paired).
2580
2581
2582								`delta`
2583								Float
2584								`undef`
2585								True difference in means.
2586
2587
2588								`sd`
2589								Float
2590								1.0
2591								Standard deviation.
2592
2593
2594								`sig_level`
2595								Float
2596								0.05
2597								Significance level (Type I error probability). Also accepts `sig.level`.
2598
2599
2600								`power`
2601								Float
2602								`undef`
2603								Power of test (1 minus Type II error probability).
2604
2605
2606								`type`
2607								String
2608								`"two.sample"`
2609								Type of t-test: `"two.sample"`, `"one.sample"`, or `"paired"`.
2610
2611
2612								`alternative`
2613								String
2614								`"two.sided"`
2615								One- or two-sided test: `"two.sided"`, `"one.sided"`, `"greater"`, or `"less"`.
2616
2617
2618								`strict`
2619								Boolean
2620								0 (False)
2621								Use strict interpretation of two-sided power calculations.
2622
2623
2624								`tol`
2625								Float
2626								~`1.22e-4`
2627								Numerical tolerance used for the internal root-finding algorithm.
2628
2629
2630

2631

2632 =end html

2633

2634

2635

2636 =head2 prcomp

2637

2638 Principal Component Analysis

2639

2640 =head3 Options

2641

2642

2643

2644 =begin html

2645

2646

2647
2648
2649								Option
2650								Type
2651								Default
2652								Description
2653
2654
2655
2656
2657								`center`
2658								Boolean
2659								`1` (True)
2660								If true, the variables are shifted to be zero-centered before the analysis takes place.
2661
2662
2663								`scale`
2664								Boolean
2665								`0` (False)
2666								If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will `croak` to prevent division by zero.
2667
2668
2669								`retx`
2670								Boolean
2671								`1` (True)
2672								If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key `x`.
2673
2674
2675								`tol`
2676								Number
2677								`undef`
2678								A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to `tol` times the standard deviation of the first component.
2679
2680
2681								`rank`
2682								Integer
2683								`undef`
2684								Optionally specify a strict limit on the number of principal components to return. The function will return `min(rank, rows, columns)` components.
2685
2686
2687

2688

2689 =end html

2690

2691

2692

2693 =head3 Results

2694

2695 =head4 Returned Data Structure

2696

2697 The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis:

2698

2699

2700

2701 =begin html

2702

2703

2704
2705
2706								Key
2707								Type
2708								Description
2709
2710
2711
2712
2713								`sdev`
2714								ArrayRef[Number]
2715								The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix.
2716
2717
2718								`rotation`
2719								ArrayRef[ArrayRef]
2720								A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components.
2721
2722
2723								`x`
2724								ArrayRef[ArrayRef]
2725								A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the `retx` option is true.
2726
2727
2728								`center`
2729								ArrayRef[Number] or `0`
2730								The centering values used (typically the column means). Returns false (`0`) if centering was disabled.
2731
2732
2733								`scale`
2734								ArrayRef[Number] or `0`
2735								The scaling values used (typically the column standard deviations). Returns false (`0`) if scaling was disabled.
2736
2737
2738								`varnames`
2739								ArrayRef[String]
2740								The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH).
2741
2742
2743

2744

2745 =end html

2746

2747

2748

2749 =head3 Using array of arrays

2750

2751 my $aoa = [

2752 [2, 4],

2753 [4, 2],

2754 [6, 6]

2755 ];

2756

2757 my $pca = prcomp($aoa);

2758

2759 which returns

2760

2761 {

2762 center [

2763 [0] 4,

2764 [1] 4

2765 ],

2766 rotation [

2767 [0] [

2768 [0] 0.707106781186547,

2769 [1] 0.707106781186548

2770 ],

2771 [1] [

2772 [0] 0.707106781186548,

2773 [1] -0.707106781186547

2774 ]

2775 ],

2776 scale 0,

2777 sdev [

2778 [0] 2.44948974278318,

2779 [1] 1.4142135623731

2780 ],

2781 x [

2782 [0] [

2783 [0] -1.41421356237309,

2784 [1] -1.4142135623731

2785 ],

2786 [1] [

2787 [0] -1.4142135623731,

2788 [1] 1.41421356237309

2789 ],

2790 [2] [

2791 [0] 2.82842712474619,

2792 [1] 2.22044604925031e-16

2793 ]

2794 ]

2795 }

2796

2797 =head3 Hash of Arrays

2798

2799 my $hoa = { B => [4, 2, 6], A => [2, 4, 6] };

2800 my $pca = prcomp($hoa);

2801

2802 =head2 quantile

2803

2804 Calculates sample quantiles using R's continuous Type 7 interpolation.

2805

2806 my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);

2807

2808 If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).

2809

2810 =head2 rbinom

2811

2812 Create a binomial distribution of numbers

2813

2814 my $binom = rbinom( n => $n, prob => 0.5, size => 9);

2815

2816 It hooks directly into Perl's internal PRNG system, respecting C seeds.

2817

2818 =head2 read_table

2819

2820 I've tried to make this as simple as possible, trying to follow from R:

2821

2822 my $test_data = read_table('t/HepatitisCdata.csv');

2823

2824 =head3 options

2825

2826

2827

2828 =begin html

2829

2830

2831
2832
2833								Option
2834								Description
2835								Example
2836
2837
2838
2839
2840								`comment`
2841								Comment character, by default `#`
2842								`comment => %` or whatever; does not apply with header
2843
2844
2845								`output.type`
2846								data type for output: array of hash, hash of array, or hash of hash
2847								`'output.type' => 'aoh'`
2848
2849
2850								`filter`
2851								Only take in rows with a certain filter
2852								`filter => { Sex => sub {$_ eq 'f'} }`
2853
2854
2855								`row.names`
2856								include row names in retrieved data; off by default
2857
2858
2859
2860								`sep`
2861								field separator character; synonym with `delim`
2862								`sep => "\t"`
2863
2864
2865								`delim`
2866								field separator character; synonym with `sep`
2867								`delim => "\t"`
2868
2869
2870

2871

2872 =end html

2873

2874

2875

2876 output types can be AOH (aoa), HOA (hoa), HOH (hoh)

2877

2878 read_table($filename, 'output.type' => 'aoh');

2879

2880 read_table($filename, 'output.type' => 'hoa');

2881

2882 and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:

2883

2884 $test_data = read_table(

2885 't/HepatitisCdata.csv',

2886 filter => {

2887 Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column

2888 },

2889 'output.type' => 'aoh'

2890 );

2891

2892 the default delimiter is C<,>

2893 Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority.

2894

2895 =head2 rnorm

2896

2897 Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C.

2898

2899 my ($rmean, $sd, $n) = (10, 2, 9999);

2900 my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);

2901

2902 =head2 runif

2903

2904 Make an approximately uniform distribution into an array

2905

2906 =head3 named arguments

2907

2908 my $unif = runif( n => $n, min => 0, max => 1);

2909

2910 where C is the number of items, the values are between C and C

2911

2912 =head3 positional args

2913

2914 this is to match R's behavior:

2915

2916 runif( 9 )

2917

2918 will make 9 numbers in [0,1]

2919

2920 runif(9, 0, 99)

2921

2922 will match C, C, and C respectively

2923

2924 =head2 sample

2925

2926 take a sample of hash or array slices.

2927

2928 my $h = sample(\%h, 4); # take 4 hash keys and their values into $h

2929

2930 or, alternatively, with arrays:

2931

2932 my $arr = sample(\@arr, 3); # take 3 indices of an array

2933

2934 =head2 scale

2935

2936 my @scaled_results = scale(1..5);

2937

2938 You can also pass an options hash to disable centering or scaling:

2939

2940 my @scaled_results = scale(1..5, { center => false, scale => 1 });

2941

2942 It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently:

2943

2944 my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);

2945

2946 =head2 sd

2947

2948 my $stdev = sd(2,4,4,4,5,5,7,9);

2949

2950 Correct answer is 2.1380899352994

2951

2952 C can accept both array references as well as arrays:

2953

2954 my $stdev = sd([2,4,4,4,5,5,7,9]);

2955

2956 As of version 0.02, sd will croak/die if any undefined values are provided.

2957

2958 =head2 seq

2959

2960 Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference.

2961

2962 =head3 Standard integer sequence

2963

2964 say 'seq(1, 5):';

2965 my @seq = seq(1, 5);

2966 say join(', ', @seq), "\n";

2967

2968 say 'seq(1, 2, 0.25):';

2969 @seq = seq(1, 2, 0.25);

2970

2971 =head3 Fractional steps

2972

2973 say 'seq(1, 2, 0.25):';

2974 @seq = seq(1, 2, 0.25);

2975 say join(", ", @seq), "\n";

2976 for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop

2977 is_approx(pop @seq, $idx, "seq item $idx with fractional step");

2978 }

2979

2980 =head3 Negative steps

2981

2982 say 'seq(10, 5, -1):';

2983 @seq = seq(10, 5, -1);

2984 say join(", ", @seq), "\n";

2985 for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop

2986 is_approx(pop @seq, $idx, "seq item $idx with negative step");

2987 }

2988

2989 =head2 shapiro_test

2990

2991 tests to see if an array reference is normally distributed, returns a p-value and a statistic

2992

2993 my $shapiro = shapiro_test(

2994 [1..5]

2995 );

2996

2997 and returns the hash reference:

2998

2999 {

3000 p.value 0.589650577093106,

3001 p_value 0.589650577093106,

3002 statistic 0.960870680168535,

3003 W 0.960870680168535

3004 }

3005

3006 =head2 sum

3007

3008 returns sum, but using both arrays and array references.

3009

3010 my $test_data = [1..8];

3011 sum($test_data)

3012

3013 which I prefer, compared to List::Util's required casting into an array:

3014

3015 sum(@{ $test_data });

3016

3017 which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B

3018

3019 as of version 0.02, C will cause the script to die if any undefined values are provided

3020

3021 =head2 summary

3022

3023 Analogous to R's C
, but does not deal with outputs from other functions.

3024 C
only describes data as it is entered.

3025 An option C or its synonym C specifies the maximum number of rows that will print.

3026

3027 =head3 array of array input

3028

3029 my @arr;

3030 foreach my $i (0..18) {

3031 push @arr, runif(22);

3032 }

3033

3034 and then C, or C

3035

3036 ---------------------------------------------------------------------------

3037 Index # values Min. 1st Qu. Median Mean 3rd Qu. Max.

3038 ---------------------------------------------------------------------------

3039 0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633

3040 1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371

3041 2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878

3042 3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808

3043 4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043

3044 5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556

3045

3046 =head3 hash of array input

3047

3048 $test_data = summary(

3049 {

3050 A => runif(9),

3051 B => runif(9)

3052 },

3053 );

3054

3055 =head2 t_test

3056

3057 There are 1-sample and 2-sample t-tests, from one or two arrays:

3058

3059 my $t_test = t_test( $array1, mu => 0.2334 );

3060

3061 or 2-sample:

3062

3063 $t_test = t_test(

3064 $array1, $array2,

3065 paired => 1

3066 );

3067

3068 returns a hash reference, which looks like:

3069

3070 conf_int => [

3071 -0.06672889, 0.25672889

3072 ],

3073 df => 5,

3074 estimate => 0.095,

3075 p_value => 0.19143688433660,

3076 statistic => 1.50996688705414

3077

3078 the two groups compared can be specified, though not necessarily, as C and C, just like in R:

3079

3080 $t_test = t_test(

3081 'x' => $array1, 'y' => $array2,

3082 paired => 1

3083 );

3084

3085 =head3 Parameters

3086

3087

3088

3089 =begin html

3090

3091

3092
3093
3094								Parameter
3095								Type
3096								Default
3097								Description
3098
3099
3100
3101
3102								`x`
3103								Array Reference
3104								Required
3105								The first vector of data. Must contain at least 2 elements.
3106
3107
3108								`y`
3109								Array Reference
3110								`undef`
3111								The second vector of data. Required for two-sample or paired tests.
3112
3113
3114								`mu`
3115								Float
3116								0.0
3117								The true value of the mean (or difference in means) for the null hypothesis.
3118
3119
3120								`paired`
3121								Boolean
3122								`FALSE`
3123								If true, performs a paired t-test. `x` and `y` must be the same length.
3124
3125
3126								`var_equal`
3127								Boolean
3128								`FALSE`
3129								If true, assumes equal variances (standard two-sample). If false, performs Welch's t-test with unequal variances.
3130
3131
3132								`conf_level`
3133								Float
3134								0.95
3135								Confidence level for the returned confidence interval. Must be between 0 and 1.
3136
3137
3138								`alternative`
3139								String
3140								`"two.sided"`
3141								Direction of the alternative hypothesis: `"two.sided"`, `"less"`, or `"greater"`.
3142
3143
3144

3145

3146 =end html

3147

3148

3149

3150 =head3 Return Hash

3151

3152

3153

3154 =begin html

3155

3156

3157
3158
3159								Key
3160								Description
3161
3162
3163
3164
3165								`statistic`
3166								The computed t-statistic.
3167
3168
3169								`df`
3170								Degrees of freedom for the test.
3171
3172
3173								`p_value`
3174								The calculated p-value based on the test directionality.
3175
3176
3177								`conf_int`
3178								An Array Reference containing two elements: `[lower_bound, upper_bound]`.
3179
3180
3181								`estimate`
3182								The estimated mean of `x` (one-sample) OR the mean of the differences (paired).
3183
3184
3185								`estimate_x`
3186								The estimated mean of the `x` vector (only returned in two-sample tests).
3187
3188
3189								`estimate_y`
3190								The estimated mean of the `y` vector (only returned in two-sample tests).
3191
3192
3193

3194

3195 =end html

3196

3197

3198

3199 =head2 transpose

3200

3201 Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes.

3202 Returns a new reference of the same type; the input is never modified.

3203

3204 =head3 Array of array input

3205

3206 Takes a reference to an array of array references and returns a new AoA where C.

3207

3208 my $matrix = [[1, 2, 3], [4, 5, 6]];

3209 my $t = transpose($matrix);

3210 # [[1, 4],

3211 # [2, 5],

3212 # [3, 6]]

3213

3214 All rows must be the same length; a ragged input is a fatal error.

3215 C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>.

3216

3217 Dies if:

3218 - any inner element is not an array reference

3219 - rows differ in length (ragged array)

3220

3221 =head3 Hash of hash input

3222

3223 Takes a reference to a hash of hash references and returns a new HoH where C.

3224

3225 my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } };

3226 my $t = transpose($table);

3227 # { score => { alice => 97, bob => 84 },

3228 # grade => { alice => 'A', bob => 'B' } }

3229

3230 Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed.

3231

3232 my $sparse = {

3233 a => { x => 1, y => 2 },

3234 b => { x => 3, z => 4 } };

3235

3236 my $t = transpose($sparse);

3237 # { x => { a => 1, b => 3 },

3238 # y => { a => 2 },

3239 # z => { b => 4 } }

3240

3241 An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>.

3242

3243 Dies if any inner element is not a hash reference

3244

3245 =head2 value_counts

3246

3247 Count the values in a given data set, return a hash reference showing how many times each particular value is present.

3248

3249 =head3 Scalar

3250

3251 $hash = value_counts('c');

3252

3253 returns C<< { c =E 1 } >>

3254

3255 =head3 Array reference

3256

3257 value_counts(['a','b','b']);

3258

3259 returns C<< { a =E 1, b =E 2} >>

3260

3261 =head3 Array

3262

3263 my $value_counts = value_counts('a','b','b');

3264

3265 like an array reference above, returns C<< { a =E 1, b =E 2} >>

3266

3267 =head3 Hash

3268

3269 my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } );

3270

3271 returns C<< { a =E 2, b =E 1} >>

3272

3273 =head3 Hash of array

3274

3275 my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']});

3276

3277 without a key (like above), the occurences of C, C, and C are counted.

3278

3279 With a key, like C for above, only values within that hash key are counted:

3280

3281 my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a');

3282

3283 =head3 Hash of hash (table)

3284

3285 $hash = value_counts( {

3286 A => {

3287 a => 'x',

3288 b => 'z'

3289 },

3290 B => {

3291 a => 'x'

3292 },

3293 C => {

3294 a => 'y'

3295 }

3296 }, 'a');

3297

3298 the column, or second hash key, that you wish to count, is specified at the command line

3299

3300 =head2 var

3301

3302 as simple as possible:

3303

3304 var(2, 4, 5, 8, 9)

3305

3306 as of version 0.02, C will die if any undefined values are provided

3307

3308 like C, C, etc., C can accept array references, to make code simpler:

3309

3310 my $ref = \@arr;

3311 var($ref) = var(@arr)

3312

3313 =head2 var_test

3314

3315 As described by R: Performs an F test to compare the variances of two samples from normal populations

3316

3317 use Stats::LikeR;

3318

3319 my @x = (2.9, 3.0, 2.5, 2.6, 3.2);

3320 my @y = (3.8, 2.7, 4.0, 2.4);

3321

3322 my $vt = var_test(\@x, \@y);

3323

3324 also, conf_level can be set:

3325

3326 $vt = var_test(\@x, \@y, conf_level => 0.99);

3327

3328 as well as a ratio (from R: the hypothesized ratio of the population variances of C and C:

3329

3330 $test_data = var_test(\@xk, \@yk, ratio => 2);

3331

3332 =head2 view

3333

3334 An R-style C for the structures C returns. Prints the first

3335 few rows of a dataframe as an aligned text table, with numeric columns

3336 right-justified, string columns left-justified, and undefined cells shown as

3337 C. Works on all three C values:

3338

3339

3340

3341 =begin html

3342

3343

3344
3345
3346								`output.type`
3347								Perl structure
3348								What `view` shows
3349
3350
3351
3352
3353								`aoh`
3354								array of hash refs
3355								one line per row, sequential row numbers
3356
3357
3358								`hoa`
3359								hash of array refs
3360								values gathered column-wise by row index
3361
3362
3363								`hoh`
3364								hash of hash refs
3365								top-level keys become the row label column
3366
3367
3368

3369

3370 =end html

3371

3372

3373

3374 =head3 Synopsis

3375

3376 my $aoh = read_table('all.data.tsv', 'output.type' => 'aoh');

3377

3378 view($aoh); # first 6 rows, like head()

3379 view($aoh, n => 20); # first 20 rows

3380 view($aoh, cols => [qw(id age tt)]); # force a column order

3381 view($aoh, 'row.names' => 'id'); # use column 'id' as the row label

3382 view($aoh, na => '.', max_width => 30);

3383

3384 my $txt = view($aoh, return_only => 1); # capture the string, print nothing

3385 view($aoh, to => \*STDERR); # print somewhere other than STDOUT

3386

3387 =head3 Output

3388

3389 # AoH: 7 rows x 3 cols (showing 6)

3390 row_name Testosterone, total (nmol/L) age sex

3391 p1 18.2 41 M

3392 p2 NA 7 F

3393 p3 1.05 33 F

3394 p4 22.9 55 M

3395 p5 14 29 M

3396 p6 NA 62 F

3397 # ... 1 more row

3398

3399 The banner reports the structure type, full dimensions, and how many rows are

3400 displayed. A footer appears only when rows are hidden.

3401

3402 =head3 Arguments

3403

3404 All arguments after the data reference are optional name/value pairs.

3405

3406

3407

3408 =begin html

3409

3410

3411
3412
3413								Argument
3414								Default
3415								Meaning
3416
3417
3418
3419
3420								`n`
3421								`6`
3422								Number of rows to show. `n` greater than the table shows everything.
3423
3424
3425								`rows`
3426								`6`
3427								Number of rows to show. `n` greater than the table shows everything (synonymous with `n`)
3428
3429
3430								`cols` / `columns`
3431								—
3432								Array ref pinning column order (and which columns appear).
3433
3434
3435								`row.names`
3436								—
3437								Column to use as the row label (for `aoh`/`hoa`). See ordering note.
3438
3439
3440								`na`
3441								`'NA'`
3442								Token printed for undefined cells.
3443
3444
3445								`max_width`
3446								`80`
3447								Truncate any cell wider than this (column names are never truncated).
3448
3449
3450								`ellipsis`
3451								`'...'`
3452								Marker appended to truncated cells.
3453
3454
3455								`gap`
3456								`2`
3457								Spaces between columns.
3458
3459
3460								`to`
3461								STDOUT
3462								Filehandle to print to.
3463
3464
3465								`return_only`
3466								`0`
3467								If true, return the string and print nothing.
3468
3469
3470

3471

3472 =end html

3473

3474

3475

3476 C always returns the formatted string, whether or not it also prints.

3477

3478 =head3 A note on column order

3479

3480 C stores rows as hashes, so the original CSV column order is not

3481 preserved. C therefore sorts columns by name for a stable, reproducible

3482 layout. Two conveniences soften this:

3483

3484 =over

3485

3486 =item * A column literally named C (the label C assigns to a

3487 leading blank header) is detected automatically and moved to the left as the

3488 row label.

3489

3490 =item * Pass C<< cols =E [ ... ] >> to control both the order and the selection of columns

3491 shown.

3492

3493 =back

3494

3495 When no label column is present, C numbers the rows C<1, 2, 3, …>, the way

3496 R prints row names for an unnamed data frame.

3497

3498 =head3 Edge cases

3499

3500 =over

3501

3502 =item * Empty input (C<[]> or C<{}>) prints a clean C<0 rows x 0 cols> banner.

3503

3504 =item * Tabs, carriage returns, and newlines inside a cell are escaped (C<\t>, C<\r>,

3505 C<\n>) so one record always stays on one line.

3506

3507 =item * A non-reference argument, or a hash whose values are plain scalars, dies with

3508 a clear message rather than producing garbled output.

3509

3510 =back

3511

3512 =head3 Tests

3513

3514 The behavior above is covered by C (run with C): the three

3515 structure types, C boundaries, alignment, C rendering, truncation,

3516 C/C handling, control-character escaping, the C and

3517 C output paths, empty structures, and the error cases.

3518

3519 =head2 wilcox_test

3520

3521 $test_data = wilcox_test(

3522 [1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],

3523 [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]

3524 );

3525

3526 Computes the Wilcoxon rank-sum / Mann-Whitney test (two samples) or the Wilcoxon signed-rank test (one sample or paired), following R's C conventions.

3527 This is an alternative to the t-test, that does not assume a normal distribution.

3528 With two array refs and no C flag it runs the two-sample rank-sum test; with a single sample, or with C<< paired =E 1 >>, it runs the signed-rank test. It calculates exact p-values by default for C<< N E 50 >> without ties; when ties (or, for the signed-rank case, zero differences) are present it automatically switches to the normal approximation with continuity correction.

3529

3530 =head3 Calling conventions

3531

3532 The first one or two array-ref arguments are taken positionally as C and C; everything after that is parsed as C<< key =E value >> pairs. The named forms C<< x =E >> and C<< y =E >> are also accepted and override the positional values. The flat argument list following the positional refs must contain an even number of elements, or the call dies with a usage message.

3533

3534 # positional

3535 wilcox_test(\@x, \@y, paired => 1);

3536

3537 # fully named

3538 wilcox_test(x => \@x, y => \@y, alternative => "greater", exact => 0);

3539

3540 =head3 Input parameters

3541

3542

3543

3544 =begin html

3545

3546

3547
3548
3549								Parameter
3550								Type
3551								Default
3552								Description
3553
3554
3555
3556
3557								`x`
3558								ARRAY ref
3559								(required)
3560								The first sample. Passed positionally or as `x =>`. Non-numeric and undefined elements are silently dropped; an empty or all-missing `x` is fatal. In the two-sample test `mu` is subtracted from each `x` value.
3561
3562
3563								`y`
3564								ARRAY ref
3565								`undef`
3566								The second sample. If present and `paired` is false, a two-sample rank-sum test is run. If `paired` is true, `y` is required and must be the same length as `x`. Omit it for the one-sample signed-rank test.
3567
3568
3569								`paired`
3570								boolean
3571								`0` (false)
3572								Run a paired signed-rank test on the per-element differences `x[i] - y[i] - mu`. Requires `y` of equal length.
3573
3574
3575								`correct`
3576								boolean
3577								`1` (true)
3578								Apply the continuity correction (±0.5) when using the normal approximation. Ignored when an exact p-value is computed.
3579
3580
3581								`mu`
3582								number
3583								`0.0`
3584								Null-hypothesis location shift. Subtracted from `x` (two-sample) or from each difference (one-sample / paired).
3585
3586
3587								`exact`
3588								boolean / undef
3589								`undef` (auto)
3590								Tri-state. `undef` (or absent) selects exact automatically: when both group sizes are `< 50` and there are no ties (two-sample), or `n < 50` with no ties (signed-rank). A true value forces the exact test, a false value forces the approximation. Exact is impossible with ties — or, for the signed-rank test, with zero differences — and falls back to the approximation with a warning.
3591
3592
3593								`alternative`
3594								string
3595								`"two.sided"`
3596								One of `"two.sided"`, `"less"`, or `"greater"`. Selects the tail(s) used for the p-value.
3597
3598
3599

3600

3601 =end html

3602

3603

3604

3605 =head3 Output

3606

3607 Returns a hash ref with the following keys:

3608

3609

3610

3611 =begin html

3612

3613

3614
3615
3616								Key
3617								Type
3618								Description
3619
3620
3621
3622
3623								`statistic`
3624								number
3625								The test statistic. For the two-sample test this is the Mann-Whitney W (the `x` rank sum minus `nx*(nx+1)/2`). For the signed-rank test it is V, the sum of the ranks assigned to the positive differences.
3626
3627
3628								`p_value`
3629								number
3630								The p-value for the chosen `alternative`, capped at `1.0`. Two-sided p-values are `2 * min(p_less, p_greater)`.
3631
3632
3633								`method`
3634								string
3635								A human-readable description of the exact test variant that was run (see below).
3636
3637
3638								`alternative`
3639								string
3640								Echoes the `alternative` actually used (`"two.sided"`, `"less"`, or `"greater"`).
3641
3642
3643

3644

3645 =end html

3646

3647

3648

3649 The C string reports which path executed:

3650

3651 =over

3652

3653 =item * Two-sample: C<"Wilcoxon rank sum exact test">, C<"Wilcoxon rank sum test with continuity correction">, or C<"Wilcoxon rank sum test">.

3654

3655 =item * One-sample / paired: C<"Wilcoxon exact signed rank test">, C<"Wilcoxon signed rank test with continuity correction">, or C<"Wilcoxon signed rank test">.

3656

3657 =back

3658

3659 =head3 Notes and edge cases

3660

3661 Missing data is handled by listwise removal of non-numeric / undefined cells before ranking; in the paired case a pair is dropped if either member is missing. An empty C (or, in the two-sample case, an empty C) after this filtering is fatal.

3662

3663 For the signed-rank test, exact zero differences are discarded before ranking (matching R), and their presence disables the exact computation. Both empty-after-filtering and all-zero-difference inputs are fatal.

3664

3665 Ties are detected during ranking and trigger the tie-corrected variance in the normal approximation; they also rule out the exact p-value. When C is left on auto, the size thresholds (C<< E 50 >> per group, or C<< E 50 >> differences) are what gate the exact vs. approximate decision.

3666

3667 =head2 write_table

3668

3669 mimics R's C, with data as first argument to subroutine, and output file as second

3670

3671 write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1);

3672

3673 You can also precisely filter and reorder which columns are written by passing an array reference to C:

3674

3675 write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);

3676

3677 undefined variables are printed as C by default, but can be set as you wish using C

3678

3679 write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan')

3680

3681 as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set.

3682

3683 Args can also be accepted:

3684

3685 write_table( 'data' => \%flat, 'file' => $f );

3686

3687 =head1 changes

3688

3689 =head2 0.16

3690

3691 changes to dist.ini, the minimum Perl version disappeared when I fixed other problems

3692

3693 clarifications between run time and test dependencies

3694

3695 addition of C function to sort AoH and HoA

3696

3697 addition of C to translate array of hashes into a hash of arrays

3698

3699 fix of long double functions: https://www.cpantesters.org/cpan/report/5d5d9836-6a5f-11f1-aadb-63fd6d8775ea

3700

3701 =head3 C

3702

3703 output residual keys now use names, not integers

3704

3705 =head3 C

3706

3707 =head3 Bug fixes

3708

3709 B When

3710 C<< valid_n E= p >>, the cleanup freed the C I but not the

3711 per-row name strings it held (those had been transferred out of C,

3712 whose own array was already freed). The strings leaked on every such error.

3713 Added the per-entry C loop before freeing the array, matching the

3714 normal path.

3715

3716 B Only the first hash value was

3717 checked to be a C; subsequent values were C'd unconditionally, so

3718 a malformed row (C<< { a =E {...}, b =E 5 } >>) dereferenced a non-reference. Every

3719 row is now validated, with the partial allocations cleaned up before the

3720 C, mirroring the existing AoH path.

3721

3722 B<< C on a possibly-signed C. >> C is undefined for

3723 byte values ≥ 0x80 on platforms where C is signed. Cast to

3724 C<(unsigned char)> before the call.

3725

3726 =head3 Speed / RAM improvements

3727

3728 B C silently

3729 truncated any longer formula. Replaced with a buffer sized to

3730 C, so there is no fixed limit and no truncation.

3731

3732 B<< C<.>-expansion buffer is now a growable heap buffer. >> C

3733 silently dropped expanded terms once full. It is now a buffer that doubles on

3734 demand. Appends also went from C (which rescans from the start every

3735 time — O(n²) over many columns) to an O(1) amortised append that tracks the

3736 write position.

3737

3738 B The original

3739 C'd a C buffer, filled it, copied it into C, and freed it

3740 I — C allocations plus C copies. Each candidate row is now

3741 written straight into C at its prospective commit slot; a row that fails

3742 listwise deletion is simply overwritten by the next candidate. This removes the

3743 C allocate/free cycles and the copy loop entirely.

3744

3745 B<< Categorical levels sorted with C. >> The level list used an O(n²) bubble

3746 sort; replaced with C (relevant only for high-cardinality factors).

3747

3748 B<< Unused tail of C reclaimed after listwise deletion. >> C is allocated for

3749 all C rows up front (C is unknown until rows are scanned). When rows

3750 are dropped, C is now Ced down to C, returning the unused

3751 tail to the allocator before the OLS phase.

3752

3753 B The argument-parsing index was widened from

3754 C to C to match C, and the HoH row count now uses

3755 C rather than relying on C's return value.

3756

3757 =head3 Known limitations (left unchanged)

3758

3759 =over

3760

3761 =item * A multi-way term such as C is split only on the first C<*>, so it yields

3762 C, C, and C rather than a full three-way expansion. Deeper

3763 interactions silently fail (the unparsable term evaluates to C and the

3764 rows are dropped). This matches the documented two-way C<*> support.

3765

3766 =item * HoA input takes the row count from the first column; columns shorter than

3767 that simply contribute dropped rows rather than raising an error.

3768

3769 =back

3770

3771 =head3 C

3772

3773 =head4 Bug fixes

3774

3775 B Nearly every C after an allocation

3776 leaked memory. C does a C, so anything allocated but not yet

3777 freed is lost. Affected paths:

3778

3779 =over

3780

3781 =item * AoA and hash first-pass errors leaked C and any C entries

3782 allocated so far.

3783

3784 =item * Formula-mode "not found as an array ref" errors leaked C and C.

3785

3786 =back

3787

3788 All post-allocation errors now route through a single C label that frees

3789 every pointer unconditionally. Pointers are initialised to C and C

3790 is zero-allocated with C, so the cleanup is always safe to run.

3791

3792 B<< Undefined and non-numeric cells silently coerced to C<0.0>. >> The original

3793 second pass used C<(svp && *svp) ? SvNV(*svp) : 0.0>, meaning an C or

3794 non-numeric cell was quietly treated as zero, silently corrupting the

3795 F-statistic. Each cell is now validated with C and C;

3796 the call dies naming the group and observation index, consistent with the rest

3797 of C (C, C, C, etc.).

3798

3799 B C

3800 cast to C I adding, so an empty array (C returns C<-1>)

3801 produced C rather than C<0>. Changed to

3802 C so the C<+1> is done in signed arithmetic

3803 before the cast.

3804

3805 B<< Unreliable group count from C. >> C returns the

3806 number of buckets in use rather than the number of keys for tied hashes.

3807 Replaced with C, which always returns the correct key count.

3808

3809 =head4 Improvements

3810

3811 B<< C accepted as an alias for C. >> R users write

3812 C; the argument parser now accepts both spellings.

3813

3814 B C and manual C replaced

3815 with C, C, C, and C. C additionally

3816 preserves embedded NUL bytes in group key strings, which the previous

3817 C-based copies silently truncated.

3818

3819 =head4 Known limitations (not changed)

3820

3821 =over

3822

3823 =item * A factor column named C or C in a formula call will

3824 collide with reserved top-level keys in the result hash.

3825

3826 =item * Group names containing an embedded NUL are stored correctly but are still

3827 truncated at C when written into the output hash keys.

3828

3829 =back

3830

3831 =head3 C

3832

3833 default view shifted to 80 characters to match Linux window length

3834

3835 =head4 New features

3836

3837 =over

3838

3839 =item * B<< C is accepted as a synonym for C >> (the number of rows shown).

3840 Passing both C and C is an error.

3841

3842 =item * B C validates its argument names

3843 against the documented set (C, C, C, C, C,

3844 C, C, C, C, C, C, C) and

3845 dies listing any it does not recognise, so a misspelt option (e.g. C)

3846 is caught instead of silently ignored.

3847

3848 =item * B<< C / C is validated. >> It must be a non-negative integer; C or

3849 a non-numeric value now dies with a clear message instead of producing

3850 warnings and being treated as C<0>.

3851

3852 =item * B

3853

3854 =back

3855

3856 =head4 Bug fixes

3857

3858 =over

3859

3860 =item * B<< C<< n =E 0 >> now still prints the column header. >> Column names were collected

3861 only from the rows being shown, so requesting zero rows produced an empty

3862 header line. At least one row is now scanned (when data exists) so the

3863 header always lists the columns.

3864

3865 =item * B<< An empty hash (C<{}>) no longer dies. >> It was rejected as

3866 I<"neither ARRAY nor HASH">; it is now shown as an empty table

3867 (C<0 rows x 0 cols>), matching the handling of an empty array.

3868

3869 =item * B<< The C alias now drives the Hash-of-Hashes label header. >> The

3870 header for the row-label column consulted only C, so

3871 C<< row_names =E 'id' >> displayed C instead of C. Both spellings are

3872 now honoured consistently.

3873

3874 =item * B A Hash-of-Arrays column or

3875 Hash-of-Hashes row whose value is not actually an array/hash reference now

3876 renders as empty cells rather than throwing a dereference error.

3877

3878 =back

3879

3880 =head4 Performance

3881

3882 =over

3883

3884 =item * Column gathering no longer sorts once per scanned row. Unique column names

3885 are collected across the scanned rows and sorted a single time (same output

3886 order), and the ellipsis length is computed once rather than per cell.

3887

3888 =back

3889

3890 =head4 Tests

3891

3892 =over

3893

3894 =item * C is self-contained (the C implementation is inlined; it loads

3895 no other files) and covers the new argument handling, the bug fixes above,

3896 and the existing AoH / HoA / HoH behaviour, alignment, truncation, and

3897 output-path handling.

3898

3899 =back

3900

3901 =head3 C

3902

3903

Corrected four bugs in the C XSUB plus a portability fix in its exact signed-rank helper. Behaviour on valid input is unchanged: the R-agreement cases (unpaired C, C

; paired one-sided C, C

; separated exact C, C

) all still match R's C.

3904

3905 =head4 Bug fixes

3906

3907 =over

3908

3909 =item * B<< Invalid C is now rejected. >> Any value other than C or C previously fell through to the two-sided branch and returned a two-sided result mislabelled with the bad string, so a typo like C<< alternative =E "twosided" >> silently "worked". It now croaks unless C is one of C, C, C.

3910

3911

=item * B When every observation is tied the approximation's variance collapses to 0 and the old code divided by C: C returned C

(a "significant" difference between identical samples). It now warns and returns C

3912

3913

=item * B<< Two-sided continuity correction at C. >> R uses C, so the correction is C<0> when the statistic sits exactly on its mean; the old code used C<-0.5>. Example: C<< wilcox_test([1,4], [2,3], exact =E 0) >> changed from C

to C

(matches R).

3914

3915 =item * B<< C no longer shadows libm. >> The local C accumulator (mean of the statistic) shadowed the C library C; renamed to C (two-sample) and C (signed-rank). No active miscompute, removed as a latent hazard.

3916

3917 =back

3918

3919 =head4 Cosmetic

3920

3921 =over

3922

3923 =item * Collapsed a no-op ternary that assigned the same signed-rank exact method string on both branches; the C field is now simply C.

3924

3925 =back

3926

3927 =head4 Portability (exact signed-rank helper)

3928

3929 =over

3930

3931 =item * B<< C no longer calls C. >> The C<2^n> normaliser is now built by exact repeated doubling, which has no long-double libm dependency. This fixes an C load failure reported by a CPAN smoker (FreeBSD, perl 5.20, C) whose libm lacks the long-double math functions; the symbol resolved on glibc, which is why local builds passed. C accumulation in the DP is retained — only the C call was at fault.

3932

3933 =item * B<< C → C >> for C, C, and the DP loop counters, which also removes a C-to-C narrowing at the call site. The C result (C) stays signed so its negative-C sentinel still fires, and is cast to C only after the C<< k E 0 >> check.

3934

3935 =back

3936

3937 =head4 Tests

3938

3939 =over

3940

3941 =item * Added C (flat, no subtests): R-agreement cases, option handling (C, C, C, C, named/positional C/C, NA dropping), regressions for all four bug fixes, argument-error and C-validation checks, output shape, and C coverage of the two-sample, exact, and paired allocation paths.

3942

3943 =back

3944

3945 =head2 0.15

3946

3947 C function added, similar to R's C

3948

3949 C:

3950 filter => {

3951 'Testosterone, total (nmol/L)' => sub { defined $_ },

3952 }

3953

3954 was broken by the change in undefined variables in 0.14, but is back to being C

3955

3956 C improvement in sectioning in README

3957

3958 Numerous changes to prevent quadmath/long double CPAN test failures

3959

3960 Minimum Scalar::Util version in dist.ini is now 1.22, see https://www.cpantesters.org/cpan/report/6b682236-6567-11f1-a3bc-a055f9c4ba34

3961

3962 C is no longer needed, and removed as a dependency

3963

3964 =head3 C

3965

3966 =head4 Bug fixes

3967

3968 =over

3969

3970 =item * B C strips a

3971 leading comment marker from the header line (so a file may begin with

3972 C<#id,val>), but that strip was dead code: the XS parser skipped I line

3973 beginning with the comment string before the callback ever saw it, so a

3974 commented header was silently dropped and the first data row was mistaken for

3975 the header. The parser now delivers the first content line even when it

3976 begins with the comment marker, and only skips comment lines after the header

3977 has been seen.

3978

3979 =item * B The parser stripped

3980 C<\r> unconditionally, so a quoted value such as C<"x\ry"> lost its carriage

3981 return and would not survive a C -> C round-trip. C<\r>

3982 is now stripped only as part of a trailing CRLF line ending and as a stray CR

3983 I quotes; inside quotes it is literal data.

3984

3985 =item * B<< Duplicate column names no longer corrupt C output. >> With

3986 C<< output.type =E 'hoa' >>, a repeated column name pushed the same cell once per

3987 occurrence, so the affected columns came out longer than the others and the

3988 arrays no longer lined up by row. Columns are now keyed by unique header name

3989 (first-seen order preserved, later values win, one warning emitted).

3990

3991 =item * B Passing a defined argument

3992 that was not a CODE reference silently fell through to slurp mode and ignored

3993 the argument; it now croaks

3994 (I<"callback must be a CODE reference">).

3995

3996 =item * B<< An undefined/empty C row-name now dies instead of keying on C<"">. >>

3997 With C<< output.type =E 'hoh' >>, a row whose row-name column was empty/undef was

3998 stored under the C<''> key and raised I<"uninitialized value"> warnings. It now

3999 dies, naming the column and the offending data row.

4000

4001 =item * B A 1-based numeric

4002 filter key greater than the column count was accepted, then silently extended

4003 every row through the C<$_> write-back. It is now rejected up front with a

4004 message naming the column count.

4005

4006 =item * B<< C and C together now die. >> Supplying both silently preferred

4007 C; passing both is now an explicit error (C remains an alias for

4008 C when used alone).

4009

4010 =item * B The unknown-argument path used

4011 C to dump the offending names to STDOUT before dying; the names are now

4012 carried in the C message itself.

4013

4014 =back

4015

4016 =head4 Better diagnostics

4017

4018 =over

4019

4020 =item * Alignment errors now report B is ragged

4021 (I<"Alignment error on FILE data row N (X fields vs Y headers)">), instead of

4022 only the field/header counts.

4023

4024 =back

4025

4026 =head4 Memory-leak fixes (exception paths)

4027

4028 The parser allocated its working buffers (C, C, and — in

4029 slurp mode — C) in the XS C block, i.e. I any validation, and

4030 freed them only by falling off the end of the function. Any non-local exit

4031 therefore leaked:

4032

4033 =over

4034

4035 =item * the open-failure C leaked the row buffer and field (and the slurp

4036 accumulator);

4037

4038 =item * far more commonly, a C thrown B — which

4039 C does routinely on alignment errors, bad row names, and filter

4040 exceptions — unwound straight out of the XS frame and leaked the field, the

4041 current row, the line buffer, the slurp accumulator, I

4042 handle>.

4043

4044 =back

4045

4046 Allocations now happen in C after every croak-able check, and every

4047 long-lived resource (the file handle via C, the buffers via

4048 C) is tied to the save stack, which an exception unwinds. Measured

4049 with C: a C mid-file went from 5 leaked SVs to 0, and an

4050 open failure from 2 to 0. This is the likely source of the constant-size leaks

4051 seen in CPAN-tester reports for the exception-path tests.

4052

4053 =head4 Performance

4054

4055 =over

4056

4057 =item * B<~2.5x faster parsing> (57 -> 145 MB/s on a 100k-row quoted file). The core

4058 loop appended one character at a time with C; it now

4059 scans runs of ordinary bytes with C / a bounded scan and appends each

4060 run in a single C, copying field contents in bulk rather than byte

4061 by byte.

4062

4063 =back

4064

4065 =head4 Internal / non-behavioral

4066

4067 =over

4068

4069 =item * XS declarations moved from C to C; allocations deferred into

4070 C (see the leak fixes above).

4071

4072 =item * The filter loop now aliases the row hash with C

4073 instead of copying it with C. This removes a full

4074 per-row hash copy for every filtered row and fixes a latent staleness bug:

4075 after a filter mutated C<$_> and the change was written back, C<%_> still

4076 reflected the pre-mutation copy, so a subsequent filter in the same row saw

4077 stale values. With aliasing, C<%_> I the row, so write-backs are always

4078 visible.

4079

4080 =back

4081

4082 =head4 Known limitation (not changed)

4083

4084 =over

4085

4086 =item * B<< C does not round-trip back to C. >> C renders an

4087 C cell as an empty field by default, and C maps an empty

4088 field back to C, so the I round-trip is clean. But if a file is

4089 written with a token such as C<< 'undef.val' =E 'NA' >>, C has no

4090 inverse option and reads C back as the string C<'NA'>. C also

4091 cannot distinguish a deliberately quoted empty string (C<"">) from a missing

4092 value -- both become C. Adding an C-style option to

4093 C (mapping configurable tokens and/or empty fields to C)

4094 would close this gap.

4095

4096 =back

4097

4098 =head3 C

4099

4100 =head4 Behavior change

4101

4102 =over

4103

4104 =item * B<< C cells now write as an empty field, not an empty string. >> A missing

4105 or C value renders as nothing between separators (C) rather than a

4106 quoted empty string (C / C). Supplying C<< 'undef.val' =E 'NA' >>

4107 (or any other token) still overrides this, exactly as before. This is the

4108 only change that can alter the bytes of an existing output file; if you relied

4109 on the previous default, pass C<< 'undef.val' =E '' >> to keep an explicit empty

4110 field, or your chosen placeholder.

4111

4112 =back

4113

4114 =head4 Bug fixes

4115

4116 =over

4117

4118 =item * B

4119 Previously, cells were looked up with the raw bytes of the column name

4120 (C), which fails to match a

4121 UTF-8-flagged hash key: the column header printed correctly but every cell

4122 under it came back empty. All lookups now fetch by SV (C), header

4123 lists are gathered and sorted as SVs (C + C, preserving the

4124 flag) instead of being round-tripped through C, and the C

4125 column is matched with C rather than C. Embedded NUL bytes in

4126 keys are handled correctly as a side effect.

4127

4128 =item * B<< C<< col.names =E [] >> no longer loops forever. >> An empty C array made

4129 C return C<-1>, which — compared against an unsigned C loop

4130 index — wrapped to C and ran effectively without end. This was fixed

4131 for flat hashes previously; it was still present for hash-of-hashes,

4132 hash-of-arrays, and array-of-hashes, plus both C header-filtering

4133 loops. All such loops now use a signed index.

4134

4135 =item * B One header loop used an

4136 C index that silently wrapped past 65,535 and never terminated.

4137 It now uses C like the rest of the code.

4138

4139 =item * B Every other input shape

4140 rejects a nested reference with

4141 I<"Cannot write nested reference types to table">; a flat hash instead

4142 stringified it (e.g. C) into the file. It now croaks

4143 consistently.

4144

4145 =item * B<< C<< 'undef.val' =E undef >> is handled cleanly. >> It previously called

4146 C on C, raising an I<"uninitialized value"> warning and

4147 yielding an empty string by accident. It is now treated explicitly as an empty

4148 field, with no warning.

4149

4150 =back

4151

4152 =head4 Memory-leak fixes (exception paths)

4153

4154 =over

4155

4156 =item * The row-key list gathered for hash-of-hashes input was leaked when the output

4157 file could not be opened.

4158

4159 =item * The I<"Could not get headers"> croak on hash-of-arrays input leaked both the

4160 already-open filehandle and the headers array.

4161

4162 =back

4163

4164 =head4 Internal / non-behavioral

4165

4166 =over

4167

4168 =item * Numeric row labels are now formatted into a reused stack buffer instead of a

4169 per-row C / C allocation (no functional change; removes a

4170 cast-away-C and one allocation per row).

4171

4172 =item * Several signed/unsigned index types were made consistent (C vs

4173 C) to match C and silence the conditions behind the loop bugs

4174 above.

4175

4176 =back

4177

4178 =head4 Tests

4179

4180 =over

4181

4182 =item * C expanded from 17 to 69 assertions. New coverage targets each

4183 fix above: the empty-field default and C<< undef.val =E undef >> (no warning),

4184 C<< col.names =E [] >> termination across all four input shapes, the

4185 >65,535-column header loop (gated behind C), in-sequence

4186 numeric row labels, nested-reference rejection, CSV quoting corners

4187 (carriage return, separators inside column names, multi-character separators),

4188 empty input writing no file, and UTF-8 column names and row keys. Two leak

4189 assertions cover the exception paths above.

4190

4191 =back

4192

4193 =head2 0.14

4194

4195 C function added for rows

4196

4197 C reads undefined values to C instead of C, which makes calculations easier

4198

4199 C writes undef by default as an empty string C<''>

4200

4201 C transforms a hash of hashes into an hash of arrays

4202

4203 C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b

4204

4205 Numerous switches from C to C for local precision, like above

4206

4207 numerous changes to C for ease of use and working with datasets with numerous undefined values

4208

4209 dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea

4210

4211 C now should be complete, errors with confidence intervals fixed

4212

4213 =head2 0.13

4214

4215 C: speed improvements; commented headers are now allowed

4216

4217 C: fix for

4218

4219 Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182.

4220 main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203

4221 Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183.

4222 main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203

4223

4224 C gives better warnings for incorrect types of data given

4225

4226 Numerous changes to dist.ini to improve CPAN testing, especially for Win32

4227

4228 =head2 0.12

4229

4230 C can also take hash of arrays, and various mixes of data types

4231

4232 C: Addition of C keywords in many places; should improve CPU performance

4233

4234 Better POD formatting, correction of output hash for README's C

4235

4236 C can now accept hash of hashes as input

4237

4238 new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns

4239

4240 removed unused function from C helpers

4241

4242 C: addition of restrict keywords in preinit, should improve CPU performance

4243

4244 MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing

4245

4246 using C for tests of C, which may or may not work with CPAN testers (experimental)

4247

4248 Added function name to warnings, so I actually know which function is producing the error

4249

4250 C can also take C and C as args, in addition to positions

4251

4252 fixed C as it could hang if given empty C or C

4253

4254 Added C<__EXTENSIONS__> to source XS file for better CPAN testing

4255

4256 =head2 0.11

4257

4258 better POD formatting for tables

4259

4260 addition of MANIFEST.skip to get better testing results on CPAN

4261

4262 C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t

4263

4264 C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes

4265

4266 Better documentation for t-test

4267

4268 =head2 0.10

4269

4270 changes to compilation for CPAN, trying to get this work on Windows

4271

4272 Addition of C and C

4273

4274 C will work without key names, just like in R. Testing for C has improved.

4275

4276 =head2 0.09

4277

4278 context changes in XS C, C, and C to get better CPAN testing results

4279

4280 C keywords added to C to increase speed

4281

4282 =head2 0.08

4283

4284 Speed improvement in C
of hashes.

4285

4286 Addition of C, C, C, C, and C functions

4287

4288 Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call.

4289

4290 Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better.

4291

4292 C now returns hash-of-hash in {row}{column}

4293

4294 =head2 0.07

4295

4296 Addition of C
function.

4297

4298 Formulas can now be omitted from C, resulting in a stacked calculation as R would think.

4299

4300 Addition of C for multi-group comparisons that does not assume normality like C does.

4301

4302 C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name.