File Coverage

blib/lib/Stats/LikeR.pm

Criterion	Covered	Total	%
statement	205	367	55.8
branch	71	204	34.8
condition	36	77	46.7
subroutine	30	36	83.3
pod	4	4	100.0
total	346	688	50.2

are exported by default.

line stmt bran cond sub pod time code

1 #!/usr/bin/env perl

2 # ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance

3 require 5.010;

4 14 14 546487 use strict;

14 23

14 482

5 14 14 83 use feature 'say';

14 22

14 3472

6 package Stats::LikeR;

7 our $VERSION = 0.15;

8 require XSLoader;

9 14 14 7237 use Devel::Confess 'color';

14 128225

14 43

10 14 14 1370 use warnings FATAL => 'all';

14 23

14 629

11 14 14 5912 use autodie ':default';

14 197331

14 55

12 14 14 66792 use Exporter 'import';

14 18

14 525

13 14 14 53 use Scalar::Util 'looks_like_number';

14 17

14 8547

14 XSLoader::load('Stats::LikeR', $VERSION);

15 our @EXPORT_OK = qw(add_data aov cfilter chisq_test col col2col cor cor_test cov dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test view wilcox_test write_table);

16 our @EXPORT = @EXPORT_OK;

18 require XSLoader;

19 # ---- filter DSL: col() builds a predicate via overloading (pure Perl) -------

20 # Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm;

21 # they need no XS. filter() is the XSUB.

22 26 26 1 139024 sub col { Stats::LikeR::Col->new($_[0]) }

23 {

24 package Stats::LikeR::Col;

25 26 33 26 176 sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] }

26 # build a comparison leaf; if operands were swapped (4 > col('x')), flip the op

27 sub _c {

28 26 26 47 my ($self, $val, $swapped, $op, $flip) = @_;

29 26 100 61 Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val);

30 }

31 use overload

32 13 13 29 '>' => sub { $_[0]->_c($_[1],$_[2],'>','<') },

33 3 3 6 '<' => sub { $_[0]->_c($_[1],$_[2],'<','>') },

34 1 1 4 '>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') },

35 1 1 4 '<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') },

36 3 3 6 '==' => sub { $_[0]->_c($_[1],$_[2],'==','==') },

37 1 1 15 '!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') },

38 0 0 0 'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') },

39 1 1 4 'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') },

40 0 0 0 'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') },

41 0 0 0 'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') },

42 2 2 5 'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') },

43 1 1 4 'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') },

44 14 14 123 fallback => 1;

14 33

14 270

45 }

46 {

47 package Stats::LikeR::Pred;

48 26 26 308 sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' }

49 4 4 52 sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' }

50 use overload

51 2 2 4 '&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) },

52 1 1 3 '|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) },

53 1 1 3 '!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) },

54 14 14 4601 fallback => 1;

14 16

14 91

55 }

56 sub summary {

57 5 5 1 660125 my ($data, %args);

58 5 38 my $current_sub = (split(/::/,(caller(0))[3]))[-1];

60 5 100 66 29 if (@_ && ref $_[0]) {

61 # Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3)

62 4 4 $data = shift;

63 4 9 %args = @_; # capture any trailing key/value pairs

64 } else {

65 # Handles: summary(@runif) or summary(@runif, nrows => 2)

66 # Extract known trailing named arguments from the flat list

67 1 33 15 while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {

68 0 0 my $val = pop @_;

69 0 0 my $key = pop @_;

70 0 0 $args{$key} = $val;

71 }

72 # The remaining items in @_ make up the actual data array

73 1 6 my @list = @_;

74 1 1 $data = \@list;

75 }

76 # Normalize nrow -> nrows, default to 10

77 5 50 26 $args{nrows} //= delete($args{nrow}) // 10;

78 5 8 my $ref_type = ref $data;

79 5 50 66 17 if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) {

80 0 0 die "$current_sub' data must either be a hash or an array, not \"$ref_type\"";

81 }

82 5 6 my $single_arr = 0;

83 5 100 100 26 if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) {

84 2 4 $single_arr = 1;

85 }

86 5 11 my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');

87 5 6 my @out;

88 5 100 30 if ($single_arr == 1) {

100

89 2 3 push @out, '-' x 75;

90 2 11 my $header = sprintf('%9s ' x scalar @header, @header);

91 2 4 push @out, $header;

92 2 3 push @out, '-' x 75;

93 2 3 my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1;

198 206

2 12

94 2 50 8 if (scalar @undef > 0) {

95 0 0 say STDERR join (',', @undef);

96 0 0 die "The above indices are not defined in $current_sub";

97 }

98 2 2 my @numeric = grep {looks_like_number($_)} @{ $data };

198 261

2 4

99 2 51 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

100 2 72 my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

101 2 9 push @out, $vals;

102 } elsif ($ref_type eq 'ARRAY') {

103 1 2 push @out, '-' x 75;

104 1 6 my $header = sprintf('%9s ' x scalar @header, @header);

105 1 3 unshift @header, 'Index';

106 1 2 $header = 'Index ' . $header;

107 1 1 push @out, $header;

108 1 2 push @out, '-' x 75;

109 1 1 my $rows_printed = 0;

110 1 4 foreach my $index (0..$#$data) {

111 2 2 my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1;

18 22

2 4

112 2 50 6 if (scalar @undef > 0) {

113 0 0 say STDERR join (',', @undef);

114 0 0 die "The above indices are not defined for index $index in $current_sub";

115 }

116 2 2 my @numeric = grep {looks_like_number($_)} @{ $data->[$index] };

18 24

2 3

117 2 15 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

118 2 29 my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

119 2 4 push @out, $vals;

120 2 3 $rows_printed++;

121 2 50 6 last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe

122 }

123 } elsif ($ref_type eq 'HASH') {

124 2 4 push @out, '-' x 78;

125 2 10 my $header = sprintf('%9s ' x scalar @header, @header);

126 2 4 unshift @header, 'Key';

127 2 5 $header = ' Key ' . $header;

128 2 3 push @out, $header;

129 2 2 push @out, '-' x 78;

130 2 2 my $rows_printed = 0;

131 2 3 foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) {

4 10

2 9

132 3 5 my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1;

27 32

3 7

133 3 50 9 if (scalar @undef > 0) {

134 0 0 say STDERR join (',', @undef);

135 0 0 die "The above indices are not defined for key $key in $current_sub";

136 }

137 3 3 my @numeric = grep {looks_like_number($_)} @{ $data->{$key} };

27 37

3 5

138 3 24 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

139 3 7 my $print_key = substr($key, 0, 9);

140 3 50 21 if ((length $print_key) < 9) { # make sure that short keys line up correctly

141 3 5 $print_key .= ' ' x (9 - length $print_key);

142 }

143 3 41 my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

144 3 6 push @out, $vals;

145 3 3 $rows_printed++;

146 3 100 11 last if $rows_printed >= $args{nrows};

147 }

148 }

149 5 61 say join ("\n", @out);

150 5 27 return \@out;

151 }

152

153 sub read_table {

154 523 523 1 730274 my $file = shift;

155 523 50 4901 die "read_table: \"$file\" is not a file\n" unless -f $file;

156 523 50 3250 die "read_table: \"$file\" is not readable\n" unless -r $file;

157

158 523 661 my %input_args = @_;

159 523 100 906 if (exists $input_args{delim}) {

160 # FIX: sep + delim together used to silently prefer delim

161 die "read_table: pass either 'sep' or 'delim', not both\n"

162 1 50 3 if exists $input_args{sep};

163 1 3 $input_args{sep} = delete $input_args{delim};

164 }

165

166 523 100 1489 my $default_sep = $file =~ /\.tsv$/i ? "\t" : ',';

167 523 1137 my %args = (

168 sep => $default_sep,

169 comment => '#',

170 %input_args,

171 );

172

173 523 640 my %allowed_args = map { $_ => 1 } (

2615 3319

174 'comment', 'output.type', 'filter', 'row.names', 'sep',

175 );

176 523 954 my @undef_args = sort grep { !$allowed_args{$_} } keys %args;

1067 1523

177 523 50 819 if (@undef_args) {

178 0 0 my $current_sub = ( split /::/, (caller(0))[3] )[-1];

179 # FIX: no more printing to STDOUT from a library ('say'); the die

180 # message carries the offending argument names itself

181 0 0 die "the args \"@undef_args\" aren't defined for $current_sub\n";

182 }

183 523 100 1224 my $otype = $args{'output.type'} // 'aoh';

184 523 100 1136 die "read_table: output.type \"$otype\" isn't allowed (aoh, hoa, hoh)\n"

185 unless $otype =~ m/^(?:aoh|hoa|hoh)$/;

186

187 522 535 my $filter = $args{filter};

188 522 50 66 1200 if (defined $filter && ref($filter) eq 'CODE') {

189 0 0 $filter = { 0 => $filter };

190 } elsif (defined $filter && ref($filter) ne 'HASH') {

191 0 0 die "'filter' must be a CODE or HASH reference\n";

192 }

193

194 522 546 my (@data, %data, @header, @uniq_header,

195 %mapped_filters, @sorted_filter_flds, %seen_rownames);

196 522 467 my $data_row = 0;

197 _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {

198 6718 6718 7608 my ($line_ref) = @_;

199 6718 100 7560 if (!@header) {

200 # --- HEADER PROCESSING (copy made only here; runs once) ---

201 522 909 my @line = @$line_ref;

202 $line[0] =~ s/^\Q$args{comment}\E//

203 522 50 33 3141 if @line && defined $line[0] && length( $args{comment} // '' );

204 522 699 @header = @line;

205 522 100 66 1077 if (@header && $header[0] eq '') {

206 10 14 $header[0] = 'row_name';

207 }

208 522 466 my %seen_h;

209 # FIX: hoa output with duplicate column names used to push the

210 # same cell once PER OCCURRENCE, silently corrupting the column

211 # lengths. Iterate unique names (order preserved) instead.

212 522 566 @uniq_header = grep { !$seen_h{$_}++ } @header;

1166 2166

213 522 555 my @dup_cols = grep { $seen_h{$_} > 1 } @uniq_header;

1165 1407

214 522 100 610 warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n"

215 if @dup_cols;

216 522 100 100 1347 if ($otype eq 'hoh' && !defined $args{'row.names'}) {

217 4 8 $args{'row.names'} = $header[0];

218 }

219 522 50 66 771 if (defined $args{'row.names'}

220 61 74 && !grep { $_ eq $args{'row.names'} } @header) {

221 0 0 die "\"$args{'row.names'}\" isn't in the header of $file\n";

222 }

223 522 100 672 if ($filter) {

224 4 12 for my $k (keys %$filter) {

225 4 50 15 if ($k =~ /^\d+$/) {

226 # FIX: a numeric key past the last column used to be

227 # accepted and then silently extended every row via

228 # the $_ write-back

229 0 0 0 die "read_table: numeric filter key $k exceeds the "

230 . scalar(@header) . " columns of $file\n"

231 if $k > @header;

232 0 0 $mapped_filters{$k} = $filter->{$k};

233 } else {

234 4 12 my ($idx) = grep { $header[$_] eq $k } 0 .. $#header;

44 54

235 4 50 11 die "Filter column '$k' not found in header\n"

236 unless defined $idx;

237 4 16 $mapped_filters{ $idx + 1 } = $filter->{$k};

238 }

239 }

240 4 11 @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters;

0 0

241 }

242 522 1955 return;

243 }

244 # --- DATA PROCESSING (operate on $line_ref directly; no row copy) ---

245 6196 5233 $data_row++;

246 6196 100 7196 if (@$line_ref != @header) {

247 # FIX: alignment errors now say WHICH row is ragged

248 1 11 die sprintf "Alignment error on %s data row %d (%d fields vs %d headers).\n",

249 $file, $data_row, scalar @$line_ref, scalar @header;

250 }

251 6195 5405 my %line_hash;

252 6195 7433 for my $i (0 .. $#header) {

253 74732 67231 my $v = $line_ref->[$i];

254 74732 100 66 145140 $line_hash{ $header[$i] } = ( !defined($v) || $v eq '' ) ? undef : $v;

255 }

256 # --- APPLY FILTERS ---

257 6195 100 7180 if (@sorted_filter_flds) {

258 # FIX: 'local %_ = %line_hash' copied the whole row for every

259 # filtered row AND went stale after a $_ write-back. Aliasing the

260 # glob makes %_ THE row hash: zero copy, mutations always visible.

261 1847 1821 local *_ = \%line_hash;

262 1847 1550 my $skip = 0;

263 1847 1765 foreach my $fld (@sorted_filter_flds) {

264 1847 50 2713 local $_ = $fld == 0 ? $line_ref : $line_hash{ $header[ $fld - 1 ] };

265 1847 100 2354 if ( !$mapped_filters{$fld}->( $line_ref, \%line_hash ) ) {

266 1131 2232 $skip = 1;

267 1131 1095 last;

268 }

269 716 50 1800 if ( $fld > 0 ) { # write back any mutation made to $_

270 716 753 $line_ref->[ $fld - 1 ] = $_;

271 716 100 66 1503 $line_hash{ $header[ $fld - 1 ] }

272 = ( !defined($_) || $_ eq '' ) ? undef : $_;

273 }

274 }

275 1847 100 8545 return if $skip;

276 }

277 # Populate requested data structure

278 5064 100 7073 if ($otype eq 'aoh') {

100

279 1863 10983 push @data, \%line_hash;

280 } elsif ($otype eq 'hoa') {

281 1114 1205 push @{ $data{$_} }, $line_hash{$_} for @uniq_header;

15757 30131

282 } elsif ($otype eq 'hoh') {

283 2087 2334 my $row_name = $line_hash{ $args{'row.names'} };

284 # FIX: an undef/empty row-name cell used to become the '' hash

285 # key with "uninitialized" warnings; now it dies with the row

286 die sprintf "read_table: undefined row name (column '%s') in %s data row %d\n",

287 2087 50 2378 $args{'row.names'}, $file, $data_row

288 unless defined $row_name;

289 warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"

290 2087 100 3421 if $seen_rownames{$row_name}++;

291 2087 1988 foreach my $col (@uniq_header) {

292 29171 100 32998 next if $col eq $args{'row.names'};

293 27084 51430 $data{$row_name}{$col} = $line_hash{$col};

294 }

295 }

296 522 50 15010 });

297 521 100 6481 if ($otype eq 'aoh') {

298 509 2039 return \@data;

299 } else { # hoa or hoh

300 12 7373 return \%data;

301 }

302 }

303 # ---------------------------------------------------------------------------

304 # view(): an R-style `head` for the structures read_table() returns.

305 #

306 # Handles all three output.type values:

307 # aoh -> ARRAY of HASH refs

308 # hoa -> HASH of ARRAY refs

309 # hoh -> HASH of HASH refs

310 #

311 # Pure Perl; the only dependency is Scalar::Util (core). There is nothing for

312 # XS to speed up here: view only touches the first n rows, and the total-row

313 # count is O(1) for every structure (scalar @$aoh, array length, scalar keys).

314 # Keeping it pure Perl also keeps install portable (no compiler needed).

315 #

316 # Usage:

317 # view($data); # first 6 rows, like head()

318 # view($data, n => 20); # first 20 rows

319 # view($data, cols => [...]); # pin explicit column order

320 # view($data, na => '.', max_width => 30, gap => 1, ellipsis => '~');

321 # view($data, 'row.names' => 'id'); # use column 'id' as the row label

322 # view($data, to => \*STDERR); # print elsewhere

323 # my $s = view($data, return_only => 1); # capture string, suppress print

324 #

325 # Column order: read_table stores rows as hashes, so the original CSV column

326 # order is gone. view sorts columns by name for a stable layout and treats a

327 # column literally named 'row_name' (read_table's label for a leading blank

328 # header) as the row label. Pass cols => [...] to override the order.

329 # ---------------------------------------------------------------------------

330 14 14 39791 use Scalar::Util qw(looks_like_number);

14 18

14 31916

331

332 sub view {

333 0 0 1 my $data = shift;

334 0 my %args = @_;

335 0 0 my $n = exists $args{n} ? $args{n} : 6;

336 0 0 my $na = exists $args{na} ? $args{na} : 'NA';

337 0 0 my $maxw = exists $args{max_width} ? $args{max_width} : 50;

338 0 0 my $ell = exists $args{ellipsis} ? $args{ellipsis} : '...';

339 0 0 my $gap = exists $args{gap} ? (' ' x $args{gap}) : ' ';

340 0 0 my $ucols = $args{cols} || $args{columns};

341 0 my $fh = $args{to};

342 0 my $quiet = $args{return_only};

343

344 my $label_col = exists $args{'row.names'} ? $args{'row.names'}

345 : exists $args{row_names} ? $args{row_names}

346 0 0 : undef;

347 0 my $rt = ref $data;

348 0 0 0 die "view: expected an ARRAY (AoH) or HASH (HoA/HoH) reference, got "

349 . ($rt || 'a non-reference') . "\n"

350 unless $rt eq 'ARRAY' or $rt eq 'HASH';

351 0 my ($kind, @cols, @labels, @raw, $total, $lab_header);

352 0 0 if ($rt eq 'ARRAY') { # ---- AoH ----

353 0 $kind = 'AoH';

354 0 $total = scalar @$data;

355 0 0 my $show = $n < $total ? $n : $total;

356 0 0 $show = 0 if $show < 0;

357

358 0 0 if ($ucols) {

359 0 @cols = @$ucols;

360 } else {

361 0 my %seen;

362 0 for my $i (0 .. $show - 1) {

363 0 my $row = $data->[$i];

364 0 0 next unless ref $row eq 'HASH';

365 0 push @cols, grep { !$seen{$_}++ } sort keys %$row;

366 }

367 }

368 my $lc = defined $label_col ? $label_col

369 0 0 : (grep { $_ eq 'row_name' } @cols) ? 'row_name' : undef;

370 0 0 if (defined $lc) {

371 0 @cols = grep { $_ ne $lc } @cols;

372 0 $lab_header = $lc;

373 }

374 0 for my $i (0 .. $show - 1) {

375 0 0 my $row = $data->[$i] || {};

376 0 0 push @labels, defined $lc ? $row->{$lc} : ($i + 1);

377 0 push @raw, [ map { $row->{$_} } @cols ];

378 }

379 0 0 $lab_header = '' unless defined $lab_header;

380 } elsif ($rt eq 'HASH') {

381 0 my @keys = keys %$data;

382 0 my $sample;

383 0 0 for my $k (@keys) { $sample = $data->{$k}; last if defined $sample; }

384 0 my $vt = ref $sample;

385

386 0 0 if ($vt eq 'ARRAY') { # ---- HoA ----

387 0 $kind = 'HoA';

388 0 0 my @allcols = $ucols ? @$ucols : sort @keys;

389 0 $total = 0;

390 0 for my $k (@keys) {

391 0 0 my $l = ref $data->{$k} eq 'ARRAY' ? scalar @{ $data->{$k} } : 0;

392 0 0 $total = $l if $l > $total;

393 }

394 0 0 my $show = $n < $total ? $n : $total;

395 0 0 $show = 0 if $show < 0;

396 my $lc = defined $label_col ? $label_col

397 0 0 : (grep { $_ eq 'row_name' } @allcols) ? 'row_name' : undef;

398 0 0 @cols = grep { !defined $lc || $_ ne $lc } @allcols;

399 0 0 $lab_header = defined $lc ? $lc : '';

400 0 for my $i (0 .. $show - 1) {

401 0 0 push @labels, defined $lc ? $data->{$lc}[$i] : ($i + 1);

402 0 push @raw, [ map { $data->{$_}[$i] } @cols ];

403 }

404 } elsif ($vt eq 'HASH') { # ---- HoH ----

405 0 $kind = 'HoH';

406 0 $total = scalar @keys;

407 0 my @rk = sort @keys;

408 0 0 my $show = $n < $total ? $n : $total;

409 0 0 $show = 0 if $show < 0;

410 0 0 my @shown = $show > 0 ? @rk[0 .. $show - 1] : ();

411 0 0 if ($ucols) {

412 0 @cols = @$ucols;

413 } else {

414 0 my %seen;

415 0 for my $rkk (@shown) {

416 0 push @cols, grep { !$seen{$_}++ } sort keys %{ $data->{$rkk} };

417 }

418 }

419 0 0 @cols = grep { $_ ne $label_col } @cols if defined $label_col;

420 0 0 $lab_header = exists $args{'row.names'} ? $args{'row.names'} : 'row_name';

421 0 for my $rkk (@shown) {

422 0 push @labels, $rkk;

423 0 push @raw, [ map { $data->{$rkk}{$_} } @cols ];

424 }

425 } else {

426 0 die "view: HASH values are neither ARRAY (HoA) nor HASH (HoH) refs; "

427 . "cannot interpret as a table\n";

428 }

429 }

430

431 # numeric detection per column (drives right/left alignment)

432 0 my @numeric = (1) x scalar @cols;

433 0 for my $r (@raw) {

434 0 for my $j (0 .. $#cols) {

435 0 my $v = $r->[$j];

436 0 0 next unless defined $v;

437 0 0 $numeric[$j] = 0 unless looks_like_number($v);

438 }

439 }

440 0 0 my $lab_numeric = @labels ? 1 : 0;

441 0 0 0 for my $l (@labels) { $lab_numeric = 0, last unless defined $l && looks_like_number($l); }

442

443 # stringify + sanitize + truncate cells (column names left intact)

444 my $clean = sub {

445 0 0 my $v = shift;

446 0 0 return $na unless defined $v;

447 0 $v = "$v";

448 0 $v =~ s/\t/\\t/g; $v =~ s/\r/\\r/g; $v =~ s/\n/\\n/g;

449 0 0 0 if ($maxw && length($v) > $maxw) {

450 0 my $keep = $maxw - length($ell);

451 0 0 $keep = 0 if $keep < 0;

452 0 $v = substr($v, 0, $keep) . $ell;

453 }

454 0 return $v;

455 0 };

456 0 my @lab_s = map { $clean->($_) } @labels;

457 0 my @rows_s = map { [ map { $clean->($_) } @$_ ] } @raw;

458 0 my @head_s = @cols;

459

460 # column widths

461 0 my $lab_w = length $lab_header;

462 0 0 $lab_w = length $_ > $lab_w ? length $_ : $lab_w for @lab_s;

463 0 my @w = map { length $_ } @head_s;

464 0 for my $r (@rows_s) {

465 0 for my $j (0 .. $#cols) {

466 0 my $l = length $r->[$j];

467 0 0 $w[$j] = $l if $l > $w[$j];

468 }

469 }

470

471 my $pad = sub {

472 0 0 my ($s, $width, $right) = @_;

473 0 0 my $g = $width - length $s; $g = 0 if $g < 0;

474 0 0 return $right ? (' ' x $g) . $s : $s . (' ' x $g);

475 0 };

476

477 0 my @out;

478 0 my $shown = scalar @rows_s;

479 0 0 push @out, sprintf("# %s: %d row%s x %d col%s (showing %d)",

480 $kind, $total, ($total == 1 ? '' : 's'),

481 scalar(@cols), (@cols == 1 ? '' : 's'), $shown);

482

483 0 my @hcells = ( $pad->($lab_header, $lab_w, 0) );

484 0 push @hcells, $pad->($head_s[$_], $w[$_], $numeric[$_]) for 0 .. $#cols;

485 0 push @out, join($gap, @hcells);

486

487 0 for my $ri (0 .. $#rows_s) {

488 0 my @cells = ( $pad->($lab_s[$ri], $lab_w, $lab_numeric) );

489 0 push @cells, $pad->($rows_s[$ri][$_], $w[$_], $numeric[$_]) for 0 .. $#cols;

490 0 push @out, join($gap, @cells);

491 }

492 0 0 push @out, sprintf("# ... %d more row%s", $total - $shown,

493 ($total - $shown == 1 ? '' : 's')) if $shown < $total;

494

495 0 my $str = join("\n", @out) . "\n";

496 0 0 unless ($quiet) { defined $fh ? print {$fh} $str : print $str; }

497 0 return $str;

498 }

499

500 1;

501 =encoding utf8

502

503 =head1 Synopsis

504

505 Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc.

506 I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.

507 There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.

508 This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.

509

510 There B other modules on CPAN that can do B of this, but this works the way that I B it to.

511

512 =head1 Functions/Subroutines

513

514 ========================================================================

515

516 =head2 add_data

517

518 Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference.

519

520 =head3 Hash of Hashes (HoH)

521

522 When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows.

523

524 $data = { 'Jack Smith' => { age => 30 } };

525

526 $n = {

527 'Jack Smith' => { # Update existing (Hash)

528 dept => 'Engineering'

529 },

530 'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash)

531 'Invalid' => 'Not a reference' # Edge case safety

532 };

533

534 add_data($data, $n);

535

536 B

537

538 {

539 "Jack Smith": {

540 "age": 30,

541 "dept": "Engineering"

542 },

543 "Jane Doe": {

544 "age": 25,

545 "dept": "Sales"

546 }

547 }

548

549 =head3 Hash of Arrays (HoA)

550

551 When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C.

552

553 $data = { 'Project Alpha' => [ 'task1', 'task2' ] };

554 $n = {

555 'Project Alpha' => [ 'task3' ], # Appends to existing array

556 'Project Beta' => [ 'task1', 'task2' ] # Creates new array row

557 };

558 add_data($data, $n);

559

560 B

561

562 {

563 "Project Alpha": [ "task1", "task2", "task3" ],

564 "Project Beta": [ "task1", "task2" ]

565 }

566

567 =head3 Array of Hashes / Arrays (AoH / AoA)

568

569 C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices.

570

571 $data = [

572 { id => 1, name => 'Alice' }

573 ];

574

575 $n = [

576 { role => 'Admin' }, # Updates index 0

577 { id => 2, name => 'Bob' } # Creates index 1

578 ];

579

580 add_data($data, $n);

581

582 B

583

584 [

585 { "id": 1, "name": "Alice", "role": "Admin" },

586 { "id": 2, "name": "Bob" }

587 ]

588

589 =head3 Advanced Structural Coercion & Cross-Merging

590

591 C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target.

592

593 B<1. Inner Coercion (Mixing Rows):>

594

595 =over

596

597 =item * B Source Array rows are read in pairs and converted to key-value pairs.

598

599 =item * B Source Hash rows are flattened into key-value pairs and pushed onto the array.

600

601 =back

602

603 B<2. Root-Level Coercion (Mixing Outer Containers):>

604

605 =over

606

607 =item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored.

608

609 =item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">).

610

611 =back

612

613 =head3 Source is a mixed Hash. Keys dictate the target array index!

614

615 $n = {

616 '0' => { y => 20 }, # Merges into $data->[0]

617 '1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1]

618 'ignored' => { k => 'v' } # Ignored: cannot map to an array index

619 };

620

621 add_data($data, $n);

622

623 B

624

625 [

626 { "x": 10, "y": 20 },

627 { "z": 30 }

628 ]

629

630 NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data.

631

632 =head2 aov

633

634 Warning: assumes normal distribution

635

636 aov(

637 {

638 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

639 ctrl => [1, 1, 1, 0, 0, 0]

640 },

641 'yield ~ ctrl');

642

643 which returns

644

645 {

646 ctrl {

647 Df 1,

648 "F value" 25.6000000000001,

649 "Mean Sq" 1.70666666666667,

650 Pr(>F) 0.00718232855871859,

651 "Sum Sq" 1.70666666666667

652 },

653 Residuals {

654 Df 4,

655 "Mean Sq" 0.0666666666666665,

656 "Sum Sq" 0.266666666666666

657 }

658 }

659

660 You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:

661

662 my $res_2way = aov($data_2way, 'len ~ supp * dose');

663

664 It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.

665

666 =head3 Input Parameters

667

668

669

670 =begin html

671

672

673
674
675								Parameter
676								Type
677								Default
678								Description
679								Example
680
681
682
683
684								`data_sv`
685								`HashRef` or `ArrayRef`
686								(Required)
687								The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's `stack()` on a named list).
688
689
690
691								`formula_sv`
692								`String`
693								`undef`
694								A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to `'Value ~ Group'` and the input data is stacked.
695								`'yield ~ N * P'`
696
697
698

699

700 =end html

701

702

703

704 =head3 Output Variables

705

706 The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs.

707

708

709

710 =begin html

711

712

713
714
715								Parameter
716								Type
717								Default
718								Description
719								Example
720
721
722
723
724								(Term Name)
725								`HashRef`
726								`undef`
727								A nested hash for each independent term in the formula (e.g., `'Group'`, `'N:P'`), containing its ANOVA table statistics.
728								`{'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004}`
729
730
731								`Residuals`
732								`HashRef`
733								`undef`
734								A nested hash containing the residual (error) statistics for the fitted model.
735								`{'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55}`
736
737
738								`group_stats`
739								`HashRef`
740								`undef`
741								A nested hash containing descriptive statistics (`mean` and `size` / count) for every column evaluated in the original unstacked data structure.
742								`{'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}}`
743
744
745

746

747 =end html

748

749

750

751 =head3 omitting formula

752

753 As of version 0.07, in the case of an omitted formula, stacking is done:

754

755 aov(

756 {

757 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

758 ctrl => [1, 1, 1, 0, 0, 0]

759 },

760 );

761

762 is the equivalent of:

763

764 yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2)

765 ctrl <- c(1, 1, 1, 0, 0, 0)

766

767 # Combine them into a named list (the R equivalent of your hash)

768 my_list <- list(yield = yield, ctrl = ctrl)

769

770 # Convert the list into a "long" dataframe

771 # This creates two columns: "values" and "ind" (the group name)

772 my_data <- stack(my_list)

773

774 # Rename columns for clarity (optional but good practice)

775 colnames(my_data) <- c("Value", "Group")

776 anova_model <- aov(Value ~ Group, data = my_data)

777 summary(anova_model)

778

779 in R

780

781 =head2 cfilter

782

783 Select B out of a table and return it in the same shape. A column is

784 the inner (second-level) key of a B or an B,

785 or the outer key of a B:

786

787 use Stats::LikeR;

788 my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] );

789 cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] }

790 cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] }

791

792 C takes exactly one of C or C. C returns only the

793 matching columns; C returns everything except them. The result is the

794 same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values

795 copied and the original structure left untouched.

796

797 =head3 Selecting by name

798

799 Pass an array ref of column names. Naming a column that is not present in the

800 data is an error (it catches typos), and a row that happens not to contain a

801 kept column simply comes back without it:

802

803 my @aoh = ( { a => 1, b => 2 }, { a => 3 } );

804 cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ]

805

806 =head3 Selecting by a predicate

807

808 Instead of names, C/C accept a B — a CODE ref or a

809 function name — evaluated once per column. It is called as

810

811 $predicate->($column_values, $column_name)

812

813 where C<$column_values> is an array ref of the column's B cells (undef

814 and missing cells are dropped, so functions like C get clean input).

815 With C, columns for which the predicate is true are kept; with C,

816 those columns are dropped.

817

818 # Keep only the constant columns (standard deviation zero):

819 my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] }

820 # Drop the constant columns instead:

821 my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... }

822 # A bare function name resolves in Stats::LikeR:: (use a package for your own):

823 cfilter(\%hoa, keep => 'some_predicate');

824

825 A bare string is always treated as a B, not a single column

826 name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>.

827

828 =head3 Errors

829

830 C dies (via C) when:

831

832 =over

833

834 =item * neither C nor C is given, or both are,

835

836 =item * a named column is not present in the data,

837

838 =item * the selector is neither an array ref nor a code ref / function name, or the

839 function name cannot be resolved,

840

841 =item * an unknown option is given, or the options are not C<< name =E value >> pairs,

842

843 =item * the data is not a hash/array reference of the expected shape (a hash of hash

844 refs or array refs, or an array of hash refs).

845

846 =back

847

848 =head2 chisq_test

849

850 The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results.

851

852 For 2x2 matrices, Yates' Continuity Correction is applied automatically.

853

854 =head3 Accepted Inputs

855

856

857

858 =begin html

859

860

861
862
863								Input Type
864								Data Structure
865								Applied Test
866
867
868
869
870								1D Array
871								`[ $v1, $v2, ... ]`
872								Chi-squared test for given probabilities
873
874
875								2D Array
876								`[ [ $v1, $v2 ], [ $v3, $v4 ] ]`
877								Pearson's Chi-squared test (Yates' correction if 2x2)
878
879
880								1D Hash
881								`{ key1 => $v1, key2 => $v2 }`
882								Chi-squared test for given probabilities
883
884
885								2D Hash
886								`{ row1 => { c1 => $v1, c2 => $v2 } }`
887								Pearson's Chi-squared test (Yates' correction if 2x2)
888
889
890

891

892 =end html

893

894

895

896 =head3 Output Object Structure

897

898 The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input.

899

900

901

902 =begin html

903

904

905
906
907								Key
908								Data Type
909								Description
910
911
912
913
914								data.name
915								String
916								Identifies the input type (e.g., `"Perl ArrayRef"` or `"Perl HashRef"`).
917
918
919								expected
920								Array/Hash Ref
921								The expected frequencies, matching the geometry of the input.
922
923
924								method
925								String
926								The specific statistical test applied.
927
928
929								observed
930								Array/Hash Ref
931								The original data passed to the function.
932
933
934								p.value
935								Float
936								The calculated p-value of the test.
937
938
939								parameter
940								Hash Ref
941								Contains the degrees of freedom (`df`).
942
943
944								statistic
945								Hash Ref
946								Contains the test statistic (`X-squared`).
947
948
949

950

951 =end html

952

953

954

955 =head3 Two-Dimensional Array

956

957 Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically.

958

959 my $test_data = [

960 [762, 327, 468],

961 [484, 239, 477]

962 ];

963 my $res = chisq_test($test_data);

964

965 B

966

967 {

968 'data.name' => 'Perl ArrayRef',

969 'expected' => [

970 [ 703.671381936888, 319.645266594124, 533.683351468988 ],

971 [ 542.328618063112, 246.354733405876, 411.316648531012 ]

972 ],

973 'method' => "Pearson's Chi-squared test",

974 'observed' => [

975 [ 762, 327, 468 ],

976 [ 484, 239, 477 ]

977 ],

978 'p.value' => 2.95358918321176e-07,

979 'parameter' => { 'df' => 2 },

980 'statistic' => { 'X-squared' => 30.0701490957547 }

981 }

982

983 =head3 1-Dimensional Array (Goodness of Fit)

984

985 Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items.

986

987 my $data = [10, 20, 30];

988 my $res = chisq_test($data);

989

990 B

991

992 {

993 'data.name' => 'Perl ArrayRef',

994 'expected' => [ 20, 20, 20 ],

995 'method' => 'Chi-squared test for given probabilities',

996 'observed' => [ 10, 20, 30 ],

997 'p.value' => 0.00673794699908547,

998 'parameter' => { 'df' => 2 },

999 'statistic' => { 'X-squared' => 10 }

1000 }

1001

1002 =head3 2-Dimensional Hash (Pearson's Chi-squared)

1003

1004 Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings.

1005

1006 my $data = {

1007 GroupA => { Success => 10, Failure => 15 },

1008 GroupB => { Success => 20, Failure => 5 }

1009 };

1010

1011 my $res = chisq_test($data);

1012

1013 B

1014

1015 {

1016 'data.name' => 'Perl HashRef',

1017 'expected' => {

1018 'GroupA' => { 'Failure' => 10, 'Success' => 15 },

1019 'GroupB' => { 'Failure' => 10, 'Success' => 15 }

1020 },

1021 'method' => "Pearson's Chi-squared test with Yates' continuity correction",

1022 'observed' => {

1023 'GroupA' => { 'Failure' => 15, 'Success' => 10 },

1024 'GroupB' => { 'Failure' => 5, 'Success' => 20 }

1025 },

1026 'p.value' => 0.00937475878430379,

1027 'parameter' => { 'df' => 1 },

1028 'statistic' => { 'X-squared' => 6.75 }

1029 }

1030

1031 =head3 One-Dimensional Hash (Goodness of Fit)

1032

1033 Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks.

1034

1035 my $data = {

1036 Apples => 10,

1037 Oranges => 20,

1038 Bananas => 30

1039 };

1040

1041 my $res = chisq_test($data);

1042

1043 =head2 C

1044

1045 Apply a B to every pair of columns in a table and collect

1046 the answers in a hash of hashes.

1047

1048 It's the workhorse behind things like correlation matrices: give it your data and

1049 the name of a function that takes two columns (C, C, …) and you get

1050 back every column compared against every other column.

1051

1052 use Stats::LikeR;

1053

1054 my %data = (

1055 height => [ 170, 165, 180, 175 ],

1056 weight => [ 70, 60, 85, 77 ],

1057 age => [ 30, 41, 25, 38 ],

1058 );

1059

1060 my $result = col2col(\%data, 'cor');

1061

1062 # $result->{height}{weight} == correlation of height vs weight

1063 # $result->{height}{age} == correlation of height vs age

1064 # ...and so on for every pair

1065

1066 ========================================================================

1067

1068 =head3 Arguments

1069

1070 col2col( $data, $command, $cols, %options )

1071 col2col( $data, $command, \%options ) # options in place of $cols

1072

1073

1074

1075 =begin html

1076

1077

1078
1079
1080								Position
1081								Argument
1082								What it is
1083
1084
1085
1086
1087								1
1088								`$data`
1089								Your table, as a reference (see Data shapes below).
1090
1091
1092								2
1093								`$command`
1094								A code block or the name of a two-column function.
1095
1096
1097								3
1098								`$cols`
1099								(optional) Which columns to use as the "from" side. Omit for all.
1100
1101
1102								4+
1103								`%options`
1104								(optional) `na`, `skip.errors`, … (see Options).
1105
1106
1107

1108

1109 =end html

1110

1111

1112

1113 ========================================================================

1114

1115 =head3 Data shapes

1116

1117 C understands three layouts. In every case a B is the thing that

1118 gets compared, and the result is keyed by column name.

1119

1120 B — keys are column names:

1121

1122 my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] );

1123

1124 B — First keys are row names, second keys are columns:

1125

1126 my %hoh = (

1127 row1 => { a => 1, b => 4 },

1128 row2 => { a => 2, b => 5 },

1129 );

1130

1131 B — each element is a row, inner keys are columns:

1132

1133 my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } );

1134

1135 All three produce the same result for the same underlying numbers. Missing or

1136 C cells are handled by the C option (below).

1137

1138 ========================================================================

1139

1140 =head3 The command

1141

1142 The second argument is the function applied to each pair of columns. It is called

1143 as:

1144

1145 $command->( $column_a, $column_b ) # two ARRAY refs

1146

1147 so inside a block the two columns arrive in C<@_>:

1148

1149 my $result = col2col(\%data, sub {

1150 my ($x, $y) = @_; # $x and $y are array refs

1151 cor($x, $y);

1152 });

1153

1154 You can also pass a B. A bare name is looked up in

1155 C, so these two are equivalent:

1156

1157 col2col(\%data, 'cor');

1158 col2col(\%data, sub { cor($_[0], $_[1]) });

1159

1160 ========================================================================

1161

1162 =head3 The result

1163

1164 Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>.

1165

1166 for my $from (sort keys %$result) {

1167 for my $to (sort keys %{ $result->{$from} }) {

1168 printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to};

1169 }

1170 }

1171

1172 A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist.

1173

1174 ========================================================================

1175

1176 =head3 Restricting columns (C<$cols>)

1177

1178 By default every column is used as the "from" side. The third argument narrows

1179 that down — handy when you only care about one variable.

1180

1181 # all columns vs all columns

1182 my $all = col2col(\%data, 'cor');

1183 # just ONE column vs every other column

1184 my $one = col2col(\%data, 'cor', 'height');

1185 my $cors = $one->{height}; # { weight => ..., age => ... }

1186 # a FEW specific columns vs every other column

1187 my $few = col2col(\%data, 'cor', ['height', 'weight']);

1188

1189 The "to" side is always every other column; C<$cols> only limits the outer keys.

1190

1191 ========================================================================

1192

1193 =head3 Options

1194

1195 Options can be given two ways:

1196

1197 col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols

1198 col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed

1199

1200 The hash-ref form is convenient when you have B column restriction — it saves

1201 you from passing a placeholder. (A hash ref I C<$cols>, so you can't use

1202 it to restrict columns at the same time; use the trailing form for that.)

1203

1204 =head4 C — how undefined values are handled

1205

1206 Real data has gaps. C decides what the function sees.

1207

1208

1209

1210 =begin html

1211

1212

1213
1214
1215								Value
1216								Behaviour
1217								Use for
1218
1219
1220
1221
1222								`'pairwise'` (default)
1223								A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length.
1224								Paired stats like `cor`.
1225
1226
1227								`'omit'`
1228								Each column drops its own undefined values independently. The two columns may end up different lengths.
1229								Unpaired tests like `t_test`, `kruskal_test`, where a gap in one sample shouldn't discard a value in the other.
1230
1231
1232								`'keep'`
1233								Every row is passed through, `undef` and all.
1234								When your function does its own missing-data handling.
1235
1236
1237

1238

1239 =end html

1240

1241

1242

1243 # correlation: keep only complete pairs (the default)

1244 col2col(\%data, 'cor');

1245 # two-sample test: each column keeps its own values

1246 col2col(\%data, 't_test', undef, na => 'omit');

1247 col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder

1248

1249 C / C remain as boolean aliases for backward compatibility:

1250 C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C.

1251

1252 =head4 C — keep going when a pair fails I<(default: true)>

1253

1254 Some functions croak on degenerate input — for example C dies if a column has

1255 zero variance. By default C B that croak per pair: instead of

1256 aborting the whole run, it stores the B of the error message in that

1257 cell, so the result tells you I pair failed and I. Every other cell is

1258 computed normally.

1259

1260 my $r = col2col(\%data, 'cor');

1261 # a good pair: $r->{a}{b} == 0.83

1262 # a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0'

1263

1264 To restore the old "die on the first error" behaviour, turn it off:

1265

1266 col2col(\%data, 'cor', undef, 'skip.errors' => 0);

1267 col2col(\%data, 'cor', { 'skip.errors' => 0 });

1268

1269 Only errors from B are trapped. Mistakes in the call itself

1270 (unknown column, bad data, unknown function name, unknown option) always die.

1271

1272 ========================================================================

1273

1274 =head3 Worked examples

1275

1276 B

1277

1278 my $m = col2col(\%data, 'cor');

1279

1280 B

1281

1282 my $col = 'Testosterone, total (nmol/L)';

1283 my $cors = col2col($hoa, 'cor', $col)->{$col};

1284 for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) {

1285 next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message

1286 printf "%-30s % .3f\n", $other, $cors->{$other};

1287 }

1288

1289 B

1290

1291 my $t = col2col($hoa, 't_test', undef, na => 'omit');

1292

1293 B

1294

1295 my $m = col2col($hoa, 'cor');

1296 for my $from (sort keys %$m) {

1297 for my $to (sort keys %{ $m->{$from} }) {

1298 my $v = $m->{$from}{$to};

1299 warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error

1300 }

1301 }

1302

1303 ========================================================================

1304

1305 =head3 Gotchas

1306

1307 =over

1308

1309 =item * B, C<($col_a, $col_b)> — not a column and

1310 a name. Unpack with C.

1311

1312 =item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall

1313 variance can be flat on just the rows it shares with one partner, so C may

1314 still croak for that pair. With the default C, that shows up as a

1315 message in the single offending cell rather than killing the run.

1316

1317 =item * B<< C does not modify your data. >> It reads the table and returns a new

1318 hash of hashes.

1319

1320 =item * B — i.e.

1321 C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0>

1322 means column C is the degenerate one for that pair.

1323

1324 =back

1325

1326 =head2 cor

1327

1328 cor($array1, $array2, $method = 'pearson'),

1329

1330 that is, C is the default and will be used if C<$method> is not specified.

1331

1332 Just like R, C, C, and C are available

1333

1334 If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically.

1335

1336 =head2 cor_test

1337

1338 my $result = cor_test(

1339 'x' => $x,

1340 'y' => $y,

1341 alternative => 'two.sided',

1342 method => 'pearson',

1343 continuity => 1

1344 );

1345

1346 C safely handles C (or C) values seamlessly by computing over pairwise complete observations.

1347

1348 =head2 cov

1349

1350 cov($array1, $array2, 'pearson')

1351

1352 or

1353

1354 cov($array1, $array2, 'spearman')

1355

1356 or

1357

1358 cov($array1, $array2, 'kendall')

1359

1360 =head2 dnorm

1361

1362 gives the density of the normal distribution, with the specified mean and standard deviation.

1363

1364 In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value.

1365

1366 returns a single scalar/number if a single value is given, otherwise returns an array reference.

1367

1368 Usage:

1369

1370 dnorm(4) # assumes a mean of 0 and standard deviation of 1

1371

1372 but default mean, standard deviation, and log can be passed as parameters:

1373

1374 $x = dnorm(0, mean => 0, sd => 2, 'log' => 0);

1375

1376 =head2 filter

1377

1378 Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified.

1379

1380 my $df2 = filter($df, col('column.name') > 4);

1381

1382 C accepts a predicate in one of two forms:

1383

1384 =over

1385

1386 =item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and

1387

1388 =item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>.

1389

1390 =back

1391

1392 Both C and C

1393

1394 =head3 Arguments

1395

1396

1397

1398 =begin html

1399

1400

1401
1402
1403								Position
1404								Name
1405								Description
1406
1407
1408
1409
1410								1
1411								`$df`
1412								The data frame to filter. Either an array of hashes (AoH — e.g. the default output of `read_table`) or a hash of arrays (HoA).
1413
1414
1415								2
1416								predicate
1417								Either a `col()` comparison object or a `CODE` reference.
1418
1419
1420

1421

1422 =end html

1423

1424

1425

1426 The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned.

1427

1428 =head3 The C form

1429

1430 C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row.

1431

1432 filter($df, col('age') >= 18); # keep rows where age >= 18

1433 filter($df, col('sex') eq 'f'); # keep rows where sex is 'f'

1434 filter($df, 18 <= col('age')); # operands may be in either order

1435

1436 =head3 Comparison operators

1437

1438

1439

1440 =begin html

1441

1442

1443
1444
1445								Kind
1446								Operators
1447								Comparison
1448
1449
1450
1451
1452								Numeric
1453								`>` `<` `>=` `<=` `==` `!=`
1454								numeric (the cell and the value are compared as numbers)
1455
1456
1457								String
1458								`gt` `lt` `ge` `le` `eq` `ne`
1459								string (the cell and the value are compared as strings)
1460
1461
1462

1463

1464 =end html

1465

1466

1467

1468 C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>.

1469

1470 =head3 Combining predicates: C<&>, C<|>, C

1471

1472 Predicates compose with bitwise C<&> (and), C<|> (or), and C (not):

1473

1474 filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and

1475 filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or

1476 filter($df, !(col('x') > 100)); # not

1477

1478 Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability.

1479

1480 =head3 The code-reference form

1481

1482 For logic the operators can't express, pass a C_{. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row.}

1483

1484 filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' });

1485 filter($df, sub { $_->{name} =~ /^A/ });

1486 filter($df, sub { $_[0]{score} > $_[0]{threshold} });

1487

1488 For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape.

1489

1490 =head3 Examples

1491

1492 use Stats::LikeR;

1493 my $df = read_table('patients.csv'); # array of hashes

1494 # numeric threshold

1495 my $adults = filter($df, col('Age') >= 18);

1496 # combine conditions

1497 my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f'));

1498 # arbitrary logic with a coderef

1499 my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 });

1500 # hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel

1501 my $hoa = read_table('patients.csv', 'output.type' => 'hoa');

1502 my $sub = filter($hoa, col('Age') > 32);

1503 # $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned

1504

1505 =head3 Behavior and notes

1506

1507 =over

1508

1509 =item * B C builds and returns a new frame; C<$df> is left untouched.

1510

1511 =item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C.

1512

1513 =item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays.

1514

1515 =item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA).

1516

1517 =item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error.

1518

1519 =item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS.

1520

1521 =back

1522

1523 =head3 See also

1524

1525 C (whose C option applies the same coderef convention while reading a file), C.

1526

1527 =head2 fisher_test

1528

1529 =head3 array reference entry

1530

1531 my $array_data = [

1532 [10, 2],

1533 [3, 15]

1534 ];

1535 my $res1 = fisher_test($array_data);

1536

1537 which returns a hash reference:

1538

1539 {

1540 alternative "two.sided",

1541 conf_int [

1542 [0] 2.75343836564204,

1543 [1] 300.682787419401

1544 ],

1545 conf_level 0.95,

1546 estimate {

1547 "odds ratio" 21.3053312750168

1548 },

1549 method "Fisher's Exact Test for Count Data",

1550 p_value 0.000536724119143435

1551 }

1552

1553 =head3 hash reference entry

1554

1555 $ft = fisher_test( {

1556 Guess => {

1557 Milk => 3, Tea => 1

1558 },

1559 Truth => {

1560 Milk => 1, Tea => 3

1561 }

1562 });

1563

1564 =head2 glm

1565

1566 takes a hash of an array as input

1567

1568 my %tooth_growth = (

1569 dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1570 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5

1571 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0

1572 2.0 2.0 2.0)],

1573 len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5

1574 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5

1575 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0

1576 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],

1577 supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC

1578 VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ

1579 OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]

1580 );

1581

1582 my $glm_teeth = glm(

1583 data => \%tooth_growth,

1584 formula => 'len ~ dose + supp',

1585 family => 'gaussian'

1586 );

1587

1588 In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS):

1589

1590 my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');

1591

1592 =head3 Input Parameters

1593

1594

1595

1596 =begin html

1597

1598

1599
1600
1601								Parameter
1602								Type
1603								Default
1604								Description
1605								Example
1606
1607
1608
1609
1610								`formula`
1611								`String`
1612								None (Required)
1613								A symbolic description of the model to be fitted. Supports operators like `+`, `:`, `*`, `^`, and `-1` (to remove the intercept).
1614								`'am ~ wt + hp'`, `'y ~ x - 1'`
1615
1616
1617								`data`
1618								`HashRef` or `ArrayRef`
1619								None (Required)
1620								The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH).
1621								`\%mtcars`, `[{x => 1, y => 2}, ...]`
1622
1623
1624								`family`
1625								`String`
1626								`'gaussian'`
1627								A description of the error distribution and link function to be used in the model. Currently supports `'gaussian'` (identity link) and `'binomial'` (logit link).
1628								`'binomial'`
1629
1630
1631

1632

1633 =end html

1634

1635

1636

1637 =head3 Output variables

1638

1639

1640

1641 =begin html

1642

1643

1644
1645
1646								Variable
1647								Type
1648								Description
1649								Example
1650
1651
1652
1653
1654								`aic`
1655								`Double`
1656								Akaike's Information Criterion for the fitted model.
1657								`123.45`
1658
1659
1660								`boundary`
1661								`Integer (Boolean)`
1662								`1` if the fitted values computationally reached the `0` or `1` boundary (specific to the binomial family), `0` otherwise.
1663								`0`
1664
1665
1666								`coefficients`
1667								`HashRef`
1668								A hash mapping the expanded model term names to their estimated coefficient values.
1669								`{'Intercept' => 1.5, 'wt' => -0.5}`
1670
1671
1672								`converged`
1673								`Integer (Boolean)`
1674								`1` if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, `0` otherwise.
1675								`1`
1676
1677
1678								`deviance`
1679								`Double`
1680								The residual deviance of the fitted model.
1681								`15.2`
1682
1683
1684								`deviance.resid`
1685								`HashRef`
1686								A hash mapping data row names to their computed deviance residuals.
1687								`{'Mazda RX4' => 0.12}`
1688
1689
1690								`df.null`
1691								`Integer`
1692								The residual degrees of freedom for the null model.
1693								`31`
1694
1695
1696								`df.residual`
1697								`Integer`
1698								The residual degrees of freedom for the fitted model.
1699								`30`
1700
1701
1702								`family`
1703								`String`
1704								The statistical family used to fit the model.
1705								`"gaussian"`
1706
1707
1708								`fitted.values`
1709								`HashRef`
1710								A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response).
1711								`{'Mazda RX4' => 0.85}`
1712
1713
1714								`iter`
1715								`Integer`
1716								The number of IRLS iterations performed before convergence or hitting the iteration limit.
1717								`4`
1718
1719
1720								`null.deviance`
1721								`Double`
1722								The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed).
1723								`43.5`
1724
1725
1726								`rank`
1727								`Integer`
1728								The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters).
1729								`2`
1730
1731
1732								`summary`
1733								`HashRef`
1734								A nested hash mapping each term to its detailed summary statistics, including `Estimate`, `Std. Error`, `t value` / `z value`, and `Pr(> t )` / `Pr(> z )`. Aliased parameters return `"NaN"`.
1735								`{'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}}`
1736
1737
1738								`terms`
1739								`ArrayRef`
1740								An ordered list of the expanded term names included in the model matrix.
1741								`['Intercept', 'wt', 'hp']`
1742
1743
1744

1745

1746 =end html

1747

1748

1749

1750 =head2 group_by

1751

1752 Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column.

1753

1754 my $aoh_data = [

1755 { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },

1756 { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },

1757 { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },

1758 { 'Gender' => 'Female' } # Intentional missing target value

1759 ];

1760

1761 as well as

1762

1763 $hoh_data = {

1764 'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },

1765 'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },

1766 'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },

1767 'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value

1768 'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef

1769 };

1770

1771 and

1772

1773 my $hoa_data = {

1774 'Gender' => ['Male', 'Female', 'Male', 'Female'],

1775 'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ]

1776 };

1777

1778 then run the function thus:

1779

1780 group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender');

1781

1782 The output can be thought of like a hash, with the first string broken down by the second.

1783

1784 all become hash of arrays:

1785

1786 {

1787 Female [

1788 [0] 1.8

1789 ],

1790 Male [

1791 [0] 18.2,

1792 [1] 20.5

1793 ]

1794 }

1795

1796 returns an empty array of hashes if neither target nor group keys are found.

1797

1798 =head3 Filtering

1799

1800 Data can be further broken down with filter/subs like in C:

1801

1802 my $testosterone = group_by($d, # group testosterone by "Gender"

1803 'Testosterone, total (nmol/L)',

1804 'Gender',

1805 { 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter

1806 { 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter

1807 );

1808

1809 where each filter filters on the columns, e.g. second hash keys.

1810

1811 =head2 hoh2hoa

1812

1813 Convert a B (row-major: outer key = row, inner key = column)

1814 into a B (column-major: key = column, value = that column's

1815 cells down the rows).

1816

1817 use Stats::LikeR;

1818

1819 my %hoh = (

1820 'r1' => { 'a' => 1, 'b' => 2 },

1821 'r2' => { 'a' => 3, 'b' => 4 },

1822 );

1823

1824 my $hoa = hoh2hoa(\%hoh);

1825

1826 which returns

1827 {

1828 a => [1, 3],

1829 b => [2, 4],

1830 }

1831

1832 =head3 Behavior

1833

1834 =over

1835

1836 =item * B are the union of every inner key, so a key that appears in only

1837 some rows still becomes a column.

1838

1839 =item * B are emitted in sorted outer-key (row-name) order, and that one order

1840 is used for every column, so the arrays stay aligned and the result is

1841 reproducible regardless of hash ordering.

1842

1843 =item * B — a missing inner key, or a cell whose value is C — are filled

1844 with the fill value (see C below). Every column therefore has

1845 exactly one entry per row.

1846

1847 =item * Values are B into the result; the original structure is left

1848 untouched.

1849

1850 =item * An B hash of hashes returns an empty hash of arrays (it is not an

1851 error).

1852

1853 =back

1854

1855 =head3 Options

1856

1857 Options are passed as trailing C<< name =E value >> pairs.

1858

1859

1860

1861 =begin html

1862

1863

1864
1865
1866								Option
1867								Default
1868								Meaning
1869
1870
1871
1872
1873								`undef.val`
1874								`undef`
1875								Value used to fill a missing key or an `undef` cell. Any defined scalar works, including `0` and `''`. Passing `undef` keeps the default.
1876
1877
1878								`row.names`
1879								(none)
1880								If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column.
1881
1882
1883

1884

1885 =end html

1886

1887

1888

1889 # Ragged input with an explicit fill string:

1890 my %ragged = (

1891 'r1' => { 'a' => 1, 'b' => 2 },

1892 'r2' => { 'a' => 3, 'c' => 9 },

1893 );

1894 my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA');

1895 # {

1896 # a => [1, 3 ],

1897 # b => [2, 'NA'],

1898 # c => ['NA', 9 ],

1899 # }

1900

1901 # Keep the row labels as a column:

1902 my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id');

1903 # {

1904 # id => ['r1', 'r2'],

1905 # a => [1, 3 ],

1906 # b => [2, undef],

1907 # c => [undef, 9 ],

1908 # }

1909

1910 =head3 Errors

1911

1912 C dies (via C) when:

1913

1914 =over

1915

1916 =item * the argument is not a hash reference,

1917

1918 =item * any value in the hash is not itself a hash reference,

1919

1920 =item * an unknown option is given, or the options are not C<< name =E value >> pairs,

1921

1922 =item * C is not a plain string, or it names an already-present column.

1923

1924 =back

1925

1926 =head2 hist

1927

1928 Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.

1929

1930 my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);

1931

1932 If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.

1933

1934 =head2 kruskal_test

1935

1936 Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)

1937

1938 Performs a Kruskal-Wallis rank sum test, see

1939 https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test

1940

1941 =head3 hash of array entry

1942

1943 I feel that this is better, and more easily read, than what you get in R:

1944

1945 my %x = (

1946 'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],

1947 'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],

1948 'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]

1949 );

1950 $kt = kruskal_test(\%x);

1951

1952 =head3 R-like array entry

1953

1954 my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects

1955 my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease

1956 my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis

1957 my @x = (@xk, @yk, @zk);

1958 my @g = (

1959 (map {'Normal subjects'} 0..4),

1960 (map {'Subjects with obstructive airway disease'} 0..3),

1961 map {'Subjects with asbestosis'} 0..4

1962 );

1963 my $kt = kruskal_test(\@x, \@g);

1964

1965 =head2 ks_test

1966

1967 The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are part of the same distribution is implemented simply:

1968

1969 $ks = ks_test(\@x, \@y, alternative => 'greater');

1970

1971 returning a hash reference.

1972

1973 Also, a single array can be tested against a normal distribution:

1974

1975 $ks = ks_test($ksx, 'pnorm');

1976

1977 The p-value precision is about 1e-8, which I want to improve, but am not sure how.

1978

1979 =head2 ljoin

1980

1981 Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>.

1982 C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>

1983

1984 For example,

1985

1986 {

1987 "Jack Smith" {

1988 age 30

1989 }

1990 }

1991

1992 and a second hash,

1993 {

1994 "Jack Smith" {

1995 dept "Engineering"

1996 },

1997 "Jane Doe" {

1998 age 25

1999 }

2000 }

2001

2002 in this case, running C will modify \%h to result:

2003

2004 {

2005 "Jack Smith" {

2006 age 30,

2007 dept "Engineering"

2008 }

2009 }

2010

2011 =head2 lm

2012

2013 This is the linear models function.

2014

2015 $lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);

2016

2017 where C<$mtcars> is a hash of hashes

2018

2019 C also supports generating interaction terms directly within the formula using the C<*> operator:

2020

2021 my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);

2022

2023 If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting.

2024

2025 the dot operator also works:

2026

2027 $lm = lm(formula => 'y ~ .', data => $dot_data);

2028

2029 =head2 matrix

2030

2031 my $mat1 = matrix(

2032 data => [1..6],

2033 nrow => 2

2034 );

2035

2036 You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise.

2037

2038 As of version 0.10, parameters do not need to be named, so that C works more like R:

2039

2040 my $d = matrix(rnorm(32000), 1000, 32);

2041

2042 works as C, C, and C

2043

2044 =head2 max

2045

2046 max(1,2,3);

2047

2048 or

2049

2050 my @arr = 1..8;

2051 max(@arr, 4, 5)

2052

2053 as of version 0.02, max will die if any undefined values are provided

2054

2055 =head2 mean

2056

2057 mean(1,2,3);

2058

2059 or

2060

2061 my @arr = 1..8;

2062 mean(@arr, 4, 5)

2063

2064 or

2065

2066 mean([1,1], [2,2]) # 1.5

2067

2068 as of version 0.02, mean will die if any undefined values are provided

2069

2070 =head2 median

2071

2072 works like mean, taking array references and arrays:

2073

2074 median( $test_data[$i][0] )

2075

2076 as of version 0.02, median will die if any undefined values are provided

2077

2078 =head2 min

2079

2080 min(1,2,3);

2081

2082 or

2083

2084 my @arr = 1..8;

2085 min(@arr, 4, 5)

2086

2087 as of version 0.02, min will die if any undefined values are provided

2088

2089 =head2 mode

2090

2091 Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings)

2092

2093 @arr = mode([1,3,3,3]); # returns (3)

2094

2095 @arr = mode('a','a','c','c','z'); # returns ('a', 'c')

2096

2097 =head2 oneway_test

2098

2099 Like ANOVA/aov but does not assume normality

2100

2101 =head3 hash of array input

2102

2103 $test_data = oneway_test({

2104 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

2105 ctrl => [1, 1, 1, 0, 0, 0]

2106 });

2107

2108 which will output a hash reference:

2109

2110 {

2111 Group {

2112 Df 1,

2113 "F value" 177.504798464491,

2114 "Mean Sq" 61.6533333333333,

2115 Pr(>F) 1.31343255160843e-07,

2116 "Sum Sq" 61.6533333333333

2117 },

2118 group_stats {

2119 mean {

2120 ctrl 0.5,

2121 yield 5.03333333333333

2122 },

2123 size {

2124 ctrl 6,

2125 yield 6

2126 }

2127 },

2128 Residuals {

2129 Df 9.81767348326473,

2130 "Mean Sq" 0.353783749200256,

2131 "Sum Sq" 3.47333333333333

2132 }

2133

2134 }

2135

2136 =head3 array of array input

2137

2138 oneway_test([

2139 [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

2140 [1, 1, 1, 0, 0, 0]

2141 ]);

2142

2143 which will output a nearly identical hash reference as for hash of arrays:

2144

2145 {

2146 Group {

2147 Df 1,

2148 "F value" 177.504798464491,

2149 "Mean Sq" 61.6533333333333,

2150 Pr(>F) 1.31343255160843e-07,

2151 "Sum Sq" 61.6533333333333

2152 },

2153 group_stats {

2154 mean {

2155 "Index 0" 5.03333333333333,

2156 "Index 1" 0.5

2157 },

2158 size {

2159 "Index 0" 6,

2160 "Index 1" 6

2161 }

2162 },

2163 Residuals {

2164 Df 9.81767348326473,

2165 "Mean Sq" 0.353783749200256,

2166 "Sum Sq" 3.47333333333333

2167 }

2168 }

2169

2170 =head2 p_adjust

2171

2172 Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"

2173

2174 my @q = p_adjust(\@pvalues, $method);

2175

2176 =head2 power_t_test

2177

2178 $test_data = power_t_test(

2179 n => 30, delta => 0.5,

2180 sd => 1.0, sig_level => 0.05

2181 );

2182

2183 It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution.

2184

2185

2186

2187 =begin html

2188

2189

2190
2191
2192								Parameter
2193								Type
2194								Default
2195								Description
2196
2197
2198
2199
2200								`n`
2201								Float
2202								`undef`
2203								Number of observations (per group for two-sample, pairs for paired).
2204
2205
2206								`delta`
2207								Float
2208								`undef`
2209								True difference in means.
2210
2211
2212								`sd`
2213								Float
2214								1.0
2215								Standard deviation.
2216
2217
2218								`sig_level`
2219								Float
2220								0.05
2221								Significance level (Type I error probability). Also accepts `sig.level`.
2222
2223
2224								`power`
2225								Float
2226								`undef`
2227								Power of test (1 minus Type II error probability).
2228
2229
2230								`type`
2231								String
2232								`"two.sample"`
2233								Type of t-test: `"two.sample"`, `"one.sample"`, or `"paired"`.
2234
2235
2236								`alternative`
2237								String
2238								`"two.sided"`
2239								One- or two-sided test: `"two.sided"`, `"one.sided"`, `"greater"`, or `"less"`.
2240
2241
2242								`strict`
2243								Boolean
2244								0 (False)
2245								Use strict interpretation of two-sided power calculations.
2246
2247
2248								`tol`
2249								Float
2250								~`1.22e-4`
2251								Numerical tolerance used for the internal root-finding algorithm.
2252
2253
2254

2255

2256 =end html

2257

2258

2259

2260 =head2 prcomp

2261

2262 Principal Component Analysis

2263

2264 =head3 Options

2265

2266

2267

2268 =begin html

2269

2270

2271
2272
2273								Option
2274								Type
2275								Default
2276								Description
2277
2278
2279
2280
2281								`center`
2282								Boolean
2283								`1` (True)
2284								If true, the variables are shifted to be zero-centered before the analysis takes place.
2285
2286
2287								`scale`
2288								Boolean
2289								`0` (False)
2290								If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will `croak` to prevent division by zero.
2291
2292
2293								`retx`
2294								Boolean
2295								`1` (True)
2296								If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key `x`.
2297
2298
2299								`tol`
2300								Number
2301								`undef`
2302								A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to `tol` times the standard deviation of the first component.
2303
2304
2305								`rank`
2306								Integer
2307								`undef`
2308								Optionally specify a strict limit on the number of principal components to return. The function will return `min(rank, rows, columns)` components.
2309
2310
2311

2312

2313 =end html

2314

2315

2316

2317 =head3 Results

2318

2319 =head4 Returned Data Structure

2320

2321 The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis:

2322

2323

2324

2325 =begin html

2326

2327

2328
2329
2330								Key
2331								Type
2332								Description
2333
2334
2335
2336
2337								`sdev`
2338								ArrayRef[Number]
2339								The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix.
2340
2341
2342								`rotation`
2343								ArrayRef[ArrayRef]
2344								A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components.
2345
2346
2347								`x`
2348								ArrayRef[ArrayRef]
2349								A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the `retx` option is true.
2350
2351
2352								`center`
2353								ArrayRef[Number] or `0`
2354								The centering values used (typically the column means). Returns false (`0`) if centering was disabled.
2355
2356
2357								`scale`
2358								ArrayRef[Number] or `0`
2359								The scaling values used (typically the column standard deviations). Returns false (`0`) if scaling was disabled.
2360
2361
2362								`varnames`
2363								ArrayRef[String]
2364								The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH).
2365
2366
2367

2368

2369 =end html

2370

2371

2372

2373 =head3 Using array of arrays

2374

2375 my $aoa = [

2376 [2, 4],

2377 [4, 2],

2378 [6, 6]

2379 ];

2380

2381 my $pca = prcomp($aoa);

2382

2383 which returns

2384

2385 {

2386 center [

2387 [0] 4,

2388 [1] 4

2389 ],

2390 rotation [

2391 [0] [

2392 [0] 0.707106781186547,

2393 [1] 0.707106781186548

2394 ],

2395 [1] [

2396 [0] 0.707106781186548,

2397 [1] -0.707106781186547

2398 ]

2399 ],

2400 scale 0,

2401 sdev [

2402 [0] 2.44948974278318,

2403 [1] 1.4142135623731

2404 ],

2405 x [

2406 [0] [

2407 [0] -1.41421356237309,

2408 [1] -1.4142135623731

2409 ],

2410 [1] [

2411 [0] -1.4142135623731,

2412 [1] 1.41421356237309

2413 ],

2414 [2] [

2415 [0] 2.82842712474619,

2416 [1] 2.22044604925031e-16

2417 ]

2418 ]

2419 }

2420

2421 =head3 Hash of Arrays

2422

2423 my $hoa = { B => [4, 2, 6], A => [2, 4, 6] };

2424 my $pca = prcomp($hoa);

2425

2426 =head2 quantile

2427

2428 Calculates sample quantiles using R's continuous Type 7 interpolation.

2429

2430 my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);

2431

2432 If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).

2433

2434 =head2 rbinom

2435

2436 Create a binomial distribution of numbers

2437

2438 my $binom = rbinom( n => $n, prob => 0.5, size => 9);

2439

2440 It hooks directly into Perl's internal PRNG system, respecting C seeds.

2441

2442 =head2 read_table

2443

2444 I've tried to make this as simple as possible, trying to follow from R:

2445

2446 my $test_data = read_table('t/HepatitisCdata.csv');

2447

2448 =head3 options

2449

2450

2451

2452 =begin html

2453

2454

2455
2456
2457								Option
2458								Description
2459								Example
2460
2461
2462
2463
2464								`comment`
2465								Comment character, by default `#`
2466								`comment => %` or whatever; does not apply with header
2467
2468
2469								`output.type`
2470								data type for output: array of hash, hash of array, or hash of hash
2471								`'output.type' => 'aoh'`
2472
2473
2474								`filter`
2475								Only take in rows with a certain filter
2476								`filter => { Sex => sub {$_ eq 'f'} }`
2477
2478
2479								`row.names`
2480								include row names in retrieved data; off by default
2481
2482
2483
2484								`sep`
2485								field separator character; synonym with `delim`
2486								`sep => "\t"`
2487
2488
2489								`delim`
2490								field separator character; synonym with `sep`
2491								`delim => "\t"`
2492
2493
2494

2495

2496 =end html

2497

2498

2499

2500 output types can be AOH (aoa), HOA (hoa), HOH (hoh)

2501

2502 read_table($filename, 'output.type' => 'aoh');

2503

2504 read_table($filename, 'output.type' => 'hoa');

2505

2506 and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:

2507

2508 $test_data = read_table(

2509 't/HepatitisCdata.csv',

2510 filter => {

2511 Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column

2512 },

2513 'output.type' => 'aoh'

2514 );

2515

2516 the default delimiter is C<,>

2517 Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority.

2518

2519 =head2 rnorm

2520

2521 Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C.

2522

2523 my ($rmean, $sd, $n) = (10, 2, 9999);

2524 my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);

2525

2526 =head2 runif

2527

2528 Make an approximately uniform distribution into an array

2529

2530 =head3 named arguments

2531

2532 my $unif = runif( n => $n, min => 0, max => 1);

2533

2534 where C is the number of items, the values are between C and C

2535

2536 =head3 positional args

2537

2538 this is to match R's behavior:

2539

2540 runif( 9 )

2541

2542 will make 9 numbers in [0,1]

2543

2544 runif(9, 0, 99)

2545

2546 will match C, C, and C respectively

2547

2548 =head2 sample

2549

2550 take a sample of hash or array slices.

2551

2552 my $h = sample(\%h, 4); # take 4 hash keys and their values into $h

2553

2554 or, alternatively, with arrays:

2555

2556 my $arr = sample(\@arr, 3); # take 3 indices of an array

2557

2558 =head2 scale

2559

2560 my @scaled_results = scale(1..5);

2561

2562 You can also pass an options hash to disable centering or scaling:

2563

2564 my @scaled_results = scale(1..5, { center => false, scale => 1 });

2565

2566 It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently:

2567

2568 my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);

2569

2570 =head2 sd

2571

2572 my $stdev = sd(2,4,4,4,5,5,7,9);

2573

2574 Correct answer is 2.1380899352994

2575

2576 C can accept both array references as well as arrays:

2577

2578 my $stdev = sd([2,4,4,4,5,5,7,9]);

2579

2580 As of version 0.02, sd will croak/die if any undefined values are provided.

2581

2582 =head2 seq

2583

2584 Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference.

2585

2586 =head3 Standard integer sequence

2587

2588 say 'seq(1, 5):';

2589 my @seq = seq(1, 5);

2590 say join(', ', @seq), "\n";

2591

2592 say 'seq(1, 2, 0.25):';

2593 @seq = seq(1, 2, 0.25);

2594

2595 =head3 Fractional steps

2596

2597 say 'seq(1, 2, 0.25):';

2598 @seq = seq(1, 2, 0.25);

2599 say join(", ", @seq), "\n";

2600 for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop

2601 is_approx(pop @seq, $idx, "seq item $idx with fractional step");

2602 }

2603

2604 =head3 Negative steps

2605

2606 say 'seq(10, 5, -1):';

2607 @seq = seq(10, 5, -1);

2608 say join(", ", @seq), "\n";

2609 for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop

2610 is_approx(pop @seq, $idx, "seq item $idx with negative step");

2611 }

2612

2613 =head2 shapiro_test

2614

2615 tests to see if an array reference is normally distributed, returns a p-value and a statistic

2616

2617 my $shapiro = shapiro_test(

2618 [1..5]

2619 );

2620

2621 and returns the hash reference:

2622

2623 {

2624 p.value 0.589650577093106,

2625 p_value 0.589650577093106,

2626 statistic 0.960870680168535,

2627 W 0.960870680168535

2628 }

2629

2630 =head2 sum

2631

2632 returns sum, but using both arrays and array references.

2633

2634 my $test_data = [1..8];

2635 sum($test_data)

2636

2637 which I prefer, compared to List::Util's required casting into an array:

2638

2639 sum(@{ $test_data });

2640

2641 which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B

2642

2643 as of version 0.02, C will cause the script to die if any undefined values are provided

2644

2645 =head2 summary

2646

2647 Analogous to R's C
, but does not deal with outputs from other functions.

2648 C
only describes data as it is entered.

2649 An option C or its synonym C specifies the maximum number of rows that will print.

2650

2651 =head3 array of array input

2652

2653 my @arr;

2654 foreach my $i (0..18) {

2655 push @arr, runif(22);

2656 }

2657

2658 and then C, or C

2659

2660 ---------------------------------------------------------------------------

2661 Index # values Min. 1st Qu. Median Mean 3rd Qu. Max.

2662 ---------------------------------------------------------------------------

2663 0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633

2664 1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371

2665 2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878

2666 3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808

2667 4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043

2668 5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556

2669

2670 =head3 hash of array input

2671

2672 $test_data = summary(

2673 {

2674 A => runif(9),

2675 B => runif(9)

2676 },

2677 );

2678

2679 =head2 t_test

2680

2681 There are 1-sample and 2-sample t-tests, from one or two arrays:

2682

2683 my $t_test = t_test( $array1, mu => 0.2334 );

2684

2685 or 2-sample:

2686

2687 $t_test = t_test(

2688 $array1, $array2,

2689 paired => 1

2690 );

2691

2692 returns a hash reference, which looks like:

2693

2694 conf_int => [

2695 -0.06672889, 0.25672889

2696 ],

2697 df => 5,

2698 estimate => 0.095,

2699 p_value => 0.19143688433660,

2700 statistic => 1.50996688705414

2701

2702 the two groups compared can be specified, though not necessarily, as C and C, just like in R:

2703

2704 $t_test = t_test(

2705 'x' => $array1, 'y' => $array2,

2706 paired => 1

2707 );

2708

2709 =head3 Parameters

2710

2711

2712

2713 =begin html

2714

2715

2716
2717
2718								Parameter
2719								Type
2720								Default
2721								Description
2722
2723
2724
2725
2726								`x`
2727								Array Reference
2728								Required
2729								The first vector of data. Must contain at least 2 elements.
2730
2731
2732								`y`
2733								Array Reference
2734								`undef`
2735								The second vector of data. Required for two-sample or paired tests.
2736
2737
2738								`mu`
2739								Float
2740								0.0
2741								The true value of the mean (or difference in means) for the null hypothesis.
2742
2743
2744								`paired`
2745								Boolean
2746								`FALSE`
2747								If true, performs a paired t-test. `x` and `y` must be the same length.
2748
2749
2750								`var_equal`
2751								Boolean
2752								`FALSE`
2753								If true, assumes equal variances (standard two-sample). If false, performs Welch's t-test with unequal variances.
2754
2755
2756								`conf_level`
2757								Float
2758								0.95
2759								Confidence level for the returned confidence interval. Must be between 0 and 1.
2760
2761
2762								`alternative`
2763								String
2764								`"two.sided"`
2765								Direction of the alternative hypothesis: `"two.sided"`, `"less"`, or `"greater"`.
2766
2767
2768

2769

2770 =end html

2771

2772

2773

2774 =head3 Return Hash

2775

2776

2777

2778 =begin html

2779

2780

2781
2782
2783								Key
2784								Description
2785
2786
2787
2788
2789								`statistic`
2790								The computed t-statistic.
2791
2792
2793								`df`
2794								Degrees of freedom for the test.
2795
2796
2797								`p_value`
2798								The calculated p-value based on the test directionality.
2799
2800
2801								`conf_int`
2802								An Array Reference containing two elements: `[lower_bound, upper_bound]`.
2803
2804
2805								`estimate`
2806								The estimated mean of `x` (one-sample) OR the mean of the differences (paired).
2807
2808
2809								`estimate_x`
2810								The estimated mean of the `x` vector (only returned in two-sample tests).
2811
2812
2813								`estimate_y`
2814								The estimated mean of the `y` vector (only returned in two-sample tests).
2815
2816
2817

2818

2819 =end html

2820

2821

2822

2823 =head2 transpose

2824

2825 Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes.

2826 Returns a new reference of the same type; the input is never modified.

2827

2828 =head3 Array of array input

2829

2830 Takes a reference to an array of array references and returns a new AoA where C.

2831

2832 my $matrix = [[1, 2, 3], [4, 5, 6]];

2833 my $t = transpose($matrix);

2834 # [[1, 4],

2835 # [2, 5],

2836 # [3, 6]]

2837

2838 All rows must be the same length; a ragged input is a fatal error.

2839 C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>.

2840

2841 Dies if:

2842 - any inner element is not an array reference

2843 - rows differ in length (ragged array)

2844

2845 =head3 Hash of hash input

2846

2847 Takes a reference to a hash of hash references and returns a new HoH where C.

2848

2849 my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } };

2850 my $t = transpose($table);

2851 # { score => { alice => 97, bob => 84 },

2852 # grade => { alice => 'A', bob => 'B' } }

2853

2854 Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed.

2855

2856 my $sparse = {

2857 a => { x => 1, y => 2 },

2858 b => { x => 3, z => 4 } };

2859

2860 my $t = transpose($sparse);

2861 # { x => { a => 1, b => 3 },

2862 # y => { a => 2 },

2863 # z => { b => 4 } }

2864

2865 An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>.

2866

2867 Dies if any inner element is not a hash reference

2868

2869 =head2 value_counts

2870

2871 Count the values in a given data set, return a hash reference showing how many times each particular value is present.

2872

2873 =head3 Scalar

2874

2875 $hash = value_counts('c');

2876

2877 returns C<< { c =E 1 } >>

2878

2879 =head3 Array reference

2880

2881 value_counts(['a','b','b']);

2882

2883 returns C<< { a =E 1, b =E 2} >>

2884

2885 =head3 Array

2886

2887 my $value_counts = value_counts('a','b','b');

2888

2889 like an array reference above, returns C<< { a =E 1, b =E 2} >>

2890

2891 =head3 Hash

2892

2893 my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } );

2894

2895 returns C<< { a =E 2, b =E 1} >>

2896

2897 =head3 Hash of array

2898

2899 my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']});

2900

2901 without a key (like above), the occurences of C, C, and C are counted.

2902

2903 With a key, like C for above, only values within that hash key are counted:

2904

2905 my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a');

2906

2907 =head3 Hash of hash (table)

2908

2909 $hash = value_counts( {

2910 A => {

2911 a => 'x',

2912 b => 'z'

2913 },

2914 B => {

2915 a => 'x'

2916 },

2917 C => {

2918 a => 'y'

2919 }

2920 }, 'a');

2921

2922 the column, or second hash key, that you wish to count, is specified at the command line

2923

2924 =head2 var

2925

2926 as simple as possible:

2927

2928 var(2, 4, 5, 8, 9)

2929

2930 as of version 0.02, C will die if any undefined values are provided

2931

2932 like C, C, etc., C can accept array references, to make code simpler:

2933

2934 my $ref = \@arr;

2935 var($ref) = var(@arr)

2936

2937 =head2 var_test

2938

2939 As described by R: Performs an F test to compare the variances of two samples from normal populations

2940

2941 use Stats::LikeR;

2942

2943 my @x = (2.9, 3.0, 2.5, 2.6, 3.2);

2944 my @y = (3.8, 2.7, 4.0, 2.4);

2945

2946 my $vt = var_test(\@x, \@y);

2947

2948 also, conf_level can be set:

2949

2950 $vt = var_test(\@x, \@y, conf_level => 0.99);

2951

2952 as well as a ratio (from R: the hypothesized ratio of the population variances of C and C:

2953

2954 $test_data = var_test(\@xk, \@yk, ratio => 2);

2955

2956 =head2 view

2957

2958 An R-style C for the structures C returns. Prints the first

2959 few rows of a dataframe as an aligned text table, with numeric columns

2960 right-justified, string columns left-justified, and undefined cells shown as

2961 C. Works on all three C values:

2962

2963

2964

2965 =begin html

2966

2967

2968
2969
2970								`output.type`
2971								Perl structure
2972								What `view` shows
2973
2974
2975
2976
2977								`aoh`
2978								array of hash refs
2979								one line per row, sequential row numbers
2980
2981
2982								`hoa`
2983								hash of array refs
2984								values gathered column-wise by row index
2985
2986
2987								`hoh`
2988								hash of hash refs
2989								top-level keys become the row label column
2990
2991
2992

2993

2994 =end html

2995

2996

2997

2998 =head3 Synopsis

2999

3000 my $aoh = read_table('all.data.tsv', 'output.type' => 'aoh');

3001

3002 view($aoh); # first 6 rows, like head()

3003 view($aoh, n => 20); # first 20 rows

3004 view($aoh, cols => [qw(id age tt)]); # force a column order

3005 view($aoh, 'row.names' => 'id'); # use column 'id' as the row label

3006 view($aoh, na => '.', max_width => 30);

3007

3008 my $txt = view($aoh, return_only => 1); # capture the string, print nothing

3009 view($aoh, to => \*STDERR); # print somewhere other than STDOUT

3010

3011 =head3 Output

3012

3013 # AoH: 7 rows x 3 cols (showing 6)

3014 row_name Testosterone, total (nmol/L) age sex

3015 p1 18.2 41 M

3016 p2 NA 7 F

3017 p3 1.05 33 F

3018 p4 22.9 55 M

3019 p5 14 29 M

3020 p6 NA 62 F

3021 # ... 1 more row

3022

3023 The banner reports the structure type, full dimensions, and how many rows are

3024 displayed. A footer appears only when rows are hidden.

3025

3026 =head3 Arguments

3027

3028 All arguments after the data reference are optional name/value pairs.

3029

3030

3031

3032 =begin html

3033

3034

3035
3036
3037								Argument
3038								Default
3039								Meaning
3040
3041
3042
3043
3044								`n`
3045								`6`
3046								Number of rows to show. `n` greater than the table shows everything.
3047
3048
3049								`cols` / `columns`
3050								—
3051								Array ref pinning column order (and which columns appear).
3052
3053
3054								`row.names`
3055								—
3056								Column to use as the row label (for `aoh`/`hoa`). See ordering note.
3057
3058
3059								`na`
3060								`'NA'`
3061								Token printed for undefined cells.
3062
3063
3064								`max_width`
3065								`50`
3066								Truncate any cell wider than this (column names are never truncated).
3067
3068
3069								`ellipsis`
3070								`'...'`
3071								Marker appended to truncated cells.
3072
3073
3074								`gap`
3075								`2`
3076								Spaces between columns.
3077
3078
3079								`to`
3080								STDOUT
3081								Filehandle to print to.
3082
3083
3084								`return_only`
3085								`0`
3086								If true, return the string and print nothing.
3087
3088
3089

3090

3091 =end html

3092

3093

3094

3095 C always returns the formatted string, whether or not it also prints.

3096

3097 =head3 A note on column order

3098

3099 C stores rows as hashes, so the original CSV column order is not

3100 preserved. C therefore sorts columns by name for a stable, reproducible

3101 layout. Two conveniences soften this:

3102

3103 =over

3104

3105 =item * A column literally named C (the label C assigns to a

3106 leading blank header) is detected automatically and moved to the left as the

3107 row label.

3108

3109 =item * Pass C<< cols =E [ ... ] >> to control both the order and the selection of columns

3110 shown.

3111

3112 =back

3113

3114 When no label column is present, C numbers the rows C<1, 2, 3, …>, the way

3115 R prints row names for an unnamed data frame.

3116

3117 =head3 Edge cases

3118

3119 =over

3120

3121 =item * Empty input (C<[]> or C<{}>) prints a clean C<0 rows x 0 cols> banner.

3122

3123 =item * Tabs, carriage returns, and newlines inside a cell are escaped (C<\t>, C<\r>,

3124 C<\n>) so one record always stays on one line.

3125

3126 =item * A non-reference argument, or a hash whose values are plain scalars, dies with

3127 a clear message rather than producing garbled output.

3128

3129 =back

3130

3131 =head3 Tests

3132

3133 The behavior above is covered by C (run with C): the three

3134 structure types, C boundaries, alignment, C rendering, truncation,

3135 C/C handling, control-character escaping, the C and

3136 C output paths, empty structures, and the error cases.

3137

3138 =head2 wilcox_test

3139

3140 $test_data = wilcox_test(

3141 [1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],

3142 [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]

3143 );

3144

3145 It fully supports paired tests (C<< paired =E 1 >>) and can calculate exact p-values (the default for C<< N E 50 >> without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.

3146

3147 =head2 write_table

3148

3149 mimics R's C, with data as first argument to subroutine, and output file as second

3150

3151 write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1);

3152

3153 You can also precisely filter and reorder which columns are written by passing an array reference to C:

3154

3155 write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);

3156

3157 undefined variables are printed as C by default, but can be set as you wish using C

3158

3159 write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan')

3160

3161 as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set.

3162

3163 Args can also be accepted:

3164

3165 write_table( 'data' => \%flat, 'file' => $f );

3166

3167 =head1 changes

3168

3169 =head2 0.15

3170

3171 C function added, similar to R's C

3172

3173 C:

3174 filter => {

3175 'Testosterone, total (nmol/L)' => sub { defined $_ },

3176 }

3177

3178 was broken by the change in undefined variables in 0.14, but is back to being C

3179

3180 C improvement in sectioning in README

3181

3182 Numerous changes to prevent quadmath/long double CPAN test failures

3183

3184 Minimum Scalar::Util version in dist.ini is now 1.22, see https://www.cpantesters.org/cpan/report/6b682236-6567-11f1-a3bc-a055f9c4ba34

3185

3186 C is no longer needed, and removed as a dependency

3187

3188 =head3 C

3189

3190 =head4 Behavior change

3191

3192 =over

3193

3194 =item * B<< C cells now write as an empty field, not an empty string. >> A missing

3195 or C value renders as nothing between separators (C) rather than a

3196 quoted empty string (C / C). Supplying C<< 'undef.val' =E 'NA' >>

3197 (or any other token) still overrides this, exactly as before. This is the

3198 only change that can alter the bytes of an existing output file; if you relied

3199 on the previous default, pass C<< 'undef.val' =E '' >> to keep an explicit empty

3200 field, or your chosen placeholder.

3201

3202 =back

3203

3204 =head4 Bug fixes

3205

3206 =over

3207

3208 =item * B

3209 Previously, cells were looked up with the raw bytes of the column name

3210 (C), which fails to match a

3211 UTF-8-flagged hash key: the column header printed correctly but every cell

3212 under it came back empty. All lookups now fetch by SV (C), header

3213 lists are gathered and sorted as SVs (C + C, preserving the

3214 flag) instead of being round-tripped through C, and the C

3215 column is matched with C rather than C. Embedded NUL bytes in

3216 keys are handled correctly as a side effect.

3217

3218 =item * B<< C<< col.names =E [] >> no longer loops forever. >> An empty C array made

3219 C return C<-1>, which — compared against an unsigned C loop

3220 index — wrapped to C and ran effectively without end. This was fixed

3221 for flat hashes previously; it was still present for hash-of-hashes,

3222 hash-of-arrays, and array-of-hashes, plus both C header-filtering

3223 loops. All such loops now use a signed index.

3224

3225 =item * B One header loop used an

3226 C index that silently wrapped past 65,535 and never terminated.

3227 It now uses C like the rest of the code.

3228

3229 =item * B Every other input shape

3230 rejects a nested reference with

3231 I<"Cannot write nested reference types to table">; a flat hash instead

3232 stringified it (e.g. C) into the file. It now croaks

3233 consistently.

3234

3235 =item * B<< C<< 'undef.val' =E undef >> is handled cleanly. >> It previously called

3236 C on C, raising an I<"uninitialized value"> warning and

3237 yielding an empty string by accident. It is now treated explicitly as an empty

3238 field, with no warning.

3239

3240 =back

3241

3242 =head4 Memory-leak fixes (exception paths)

3243

3244 =over

3245

3246 =item * The row-key list gathered for hash-of-hashes input was leaked when the output

3247 file could not be opened.

3248

3249 =item * The I<"Could not get headers"> croak on hash-of-arrays input leaked both the

3250 already-open filehandle and the headers array.

3251

3252 =back

3253

3254 =head4 Internal / non-behavioral

3255

3256 =over

3257

3258 =item * Numeric row labels are now formatted into a reused stack buffer instead of a

3259 per-row C / C allocation (no functional change; removes a

3260 cast-away-C and one allocation per row).

3261

3262 =item * Several signed/unsigned index types were made consistent (C vs

3263 C) to match C and silence the conditions behind the loop bugs

3264 above.

3265

3266 =back

3267

3268 =head4 Tests

3269

3270 =over

3271

3272 =item * C expanded from 17 to 69 assertions. New coverage targets each

3273 fix above: the empty-field default and C<< undef.val =E undef >> (no warning),

3274 C<< col.names =E [] >> termination across all four input shapes, the

3275 >65,535-column header loop (gated behind C), in-sequence

3276 numeric row labels, nested-reference rejection, CSV quoting corners

3277 (carriage return, separators inside column names, multi-character separators),

3278 empty input writing no file, and UTF-8 column names and row keys. Two leak

3279 assertions cover the exception paths above.

3280

3281 =back

3282

3283 =head2 0.14

3284

3285 C function added for rows

3286

3287 C reads undefined values to C instead of C, which makes calculations easier

3288

3289 C writes undef by default as an empty string C<''>

3290

3291 C transforms a hash of hashes into an hash of arrays

3292

3293 C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b

3294

3295 Numerous switches from C to C for local precision, like above

3296

3297 numerous changes to C for ease of use and working with datasets with numerous undefined values

3298

3299 dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea

3300

3301 C now should be complete, errors with confidence intervals fixed

3302

3303 =head2 0.13

3304

3305 C: speed improvements; commented headers are now allowed

3306

3307 C: fix for

3308

3309 Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182.

3310 main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203

3311 Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183.

3312 main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203

3313

3314 C gives better warnings for incorrect types of data given

3315

3316 Numerous changes to dist.ini to improve CPAN testing, especially for Win32

3317

3318 =head2 0.12

3319

3320 C can also take hash of arrays, and various mixes of data types

3321

3322 C: Addition of C keywords in many places; should improve CPU performance

3323

3324 Better POD formatting, correction of output hash for README's C

3325

3326 C can now accept hash of hashes as input

3327

3328 new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns

3329

3330 removed unused function from C helpers

3331

3332 C: addition of restrict keywords in preinit, should improve CPU performance

3333

3334 MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing

3335

3336 using C for tests of C, which may or may not work with CPAN testers (experimental)

3337

3338 Added function name to warnings, so I actually know which function is producing the error

3339

3340 C can also take C and C as args, in addition to positions

3341

3342 fixed C as it could hang if given empty C or C

3343

3344 Added C<__EXTENSIONS__> to source XS file for better CPAN testing

3345

3346 =head2 0.11

3347

3348 better POD formatting for tables

3349

3350 addition of MANIFEST.skip to get better testing results on CPAN

3351

3352 C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t

3353

3354 C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes

3355

3356 Better documentation for t-test

3357

3358 =head2 0.10

3359

3360 changes to compilation for CPAN, trying to get this work on Windows

3361

3362 Addition of C and C

3363

3364 C will work without key names, just like in R. Testing for C has improved.

3365

3366 =head2 0.09

3367

3368 context changes in XS C, C, and C to get better CPAN testing results

3369

3370 C keywords added to C to increase speed

3371

3372 =head2 0.08

3373

3374 Speed improvement in C
of hashes.

3375

3376 Addition of C, C, C, C, and C functions

3377

3378 Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call.

3379

3380 Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better.

3381

3382 C now returns hash-of-hash in {row}{column}

3383

3384 =head2 0.07

3385

3386 Addition of C
function.

3387

3388 Formulas can now be omitted from C, resulting in a stacked calculation as R would think.

3389

3390 Addition of C for multi-group comparisons that does not assume normality like C does.

3391

3392 C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name.