File Coverage

blib/lib/Stats/LikeR.pm

Criterion	Covered	Total	%
statement	200	221	90.5
branch	70	90	77.7
condition	34	56	60.7
subroutine	29	32	90.6
pod	3	3	100.0
total	336	402	83.5

are exported by default.

line stmt bran cond sub pod time code

1 #!/usr/bin/env perl

2 # ABSTRACT: Get basic statistical functions, like in R, but with Perl using XS for performance

3 require 5.010;

4 13 13 458814 use strict;

13 17

13 438

5 13 13 80 use feature 'say';

13 15

13 3171

6 package Stats::LikeR;

7 our $VERSION = 0.14;

8 require XSLoader;

9 13 13 6411 use Devel::Confess 'color';

13 111176

13 38

10 13 13 1199 use warnings FATAL => 'all';

13 33

13 592

11 13 13 5129 use autodie ':default';

13 171192

13 42

12 13 13 57231 use Exporter 'import';

13 17

13 459

13 13 13 48 use Scalar::Util 'looks_like_number';

13 17

13 7171

14 XSLoader::load('Stats::LikeR', $VERSION);

15 our @EXPORT_OK = qw(add_data aov cfilter chisq_test col col2col cor cor_test cov dnorm filter fisher_test glm group_by hoh2hoa hist kruskal_test ks_test ljoin lm matrix max mean median min mode oneway_test p_adjust power_t_test prcomp quantile rbinom read_table rnorm runif sample scale sd seq shapiro_test sum summary t_test transpose value_counts var var_test wilcox_test write_table);

16 our @EXPORT = @EXPORT_OK;

18 require XSLoader;

19 # ---- filter DSL: col() builds a predicate via overloading (pure Perl) -------

20 # Exported: filter (XS) and col. Place col()/Col/Pred near the top of the .pm;

21 # they need no XS. filter() is the XSUB.

22 26 26 1 135822 sub col { Stats::LikeR::Col->new($_[0]) }

23 {

24 package Stats::LikeR::Col;

25 26 33 26 173 sub new { bless { name => $_[1] }, ref($_[0]) || $_[0] }

26 # build a comparison leaf; if operands were swapped (4 > col('x')), flip the op

27 sub _c {

28 26 26 47 my ($self, $val, $swapped, $op, $flip) = @_;

29 26 100 62 Stats::LikeR::Pred->_leaf($self->{name}, $swapped ? $flip : $op, $val);

30 }

31 use overload

32 13 13 24 '>' => sub { $_[0]->_c($_[1],$_[2],'>','<') },

33 3 3 12 '<' => sub { $_[0]->_c($_[1],$_[2],'<','>') },

34 1 1 4 '>=' => sub { $_[0]->_c($_[1],$_[2],'>=','<=') },

35 1 1 8 '<=' => sub { $_[0]->_c($_[1],$_[2],'<=','>=') },

36 3 3 6 '==' => sub { $_[0]->_c($_[1],$_[2],'==','==') },

37 1 1 3 '!=' => sub { $_[0]->_c($_[1],$_[2],'!=','!=') },

38 0 0 0 'lt' => sub { $_[0]->_c($_[1],$_[2],'lt','gt') },

39 1 1 4 'gt' => sub { $_[0]->_c($_[1],$_[2],'gt','lt') },

40 0 0 0 'le' => sub { $_[0]->_c($_[1],$_[2],'le','ge') },

41 0 0 0 'ge' => sub { $_[0]->_c($_[1],$_[2],'ge','le') },

42 2 2 5 'eq' => sub { $_[0]->_c($_[1],$_[2],'eq','eq') },

43 1 1 3 'ne' => sub { $_[0]->_c($_[1],$_[2],'ne','ne') },

44 13 13 118 fallback => 1;

13 20

13 225

45 }

46 {

47 package Stats::LikeR::Pred;

48 26 26 324 sub _leaf { bless { op => $_[2], col => $_[1], val => $_[3] }, 'Stats::LikeR::Pred' }

49 4 4 44 sub _node { bless { op => $_[0], l => $_[1], r => $_[2] }, 'Stats::LikeR::Pred' }

50 use overload

51 2 2 4 '&' => sub { Stats::LikeR::Pred::_node('and', $_[0], $_[1]) },

52 1 1 2 '|' => sub { Stats::LikeR::Pred::_node('or', $_[0], $_[1]) },

53 1 1 3 '!' => sub { Stats::LikeR::Pred::_node('not', $_[0], undef) },

54 13 13 3986 fallback => 1;

13 14

13 98

55 }

56 sub summary {

57 5 5 1 661492 my ($data, %args);

58 5 36 my $current_sub = (split(/::/,(caller(0))[3]))[-1];

60 5 100 66 30 if (@_ && ref $_[0]) {

61 # Handles: summary(\@arr) or summary(\@arr, nrows => 5) or summary(\%h, nrow => 3)

62 4 5 $data = shift;

63 4 7 %args = @_; # capture any trailing key/value pairs

64 } else {

65 # Handles: summary(@runif) or summary(@runif, nrows => 2)

66 # Extract known trailing named arguments from the flat list

67 1 33 20 while (@_ >= 2 && defined $_[-2] && !ref($_[-2]) && $_[-2] =~ /^(?:nrows|nrow)$/) {

68 0 0 my $val = pop @_;

69 0 0 my $key = pop @_;

70 0 0 $args{$key} = $val;

71 }

72 # The remaining items in @_ make up the actual data array

73 1 6 my @list = @_;

74 1 2 $data = \@list;

75 }

76 # Normalize nrow -> nrows, default to 10

77 5 50 24 $args{nrows} //= delete($args{nrow}) // 10;

78 5 9 my $ref_type = ref $data;

79 5 50 66 17 if (($ref_type ne 'ARRAY') && ($ref_type ne 'HASH')) {

80 0 0 die "$current_sub' data must either be a hash or an array, not \"$ref_type\"";

81 }

82 5 7 my $single_arr = 0;

83 5 100 100 18 if (($ref_type eq 'ARRAY') && (ref $data->[0] eq '')) {

84 2 3 $single_arr = 1;

85 }

86 5 11 my @header = ('# values', 'Min.', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max.');

87 5 6 my @out;

88 5 100 13 if ($single_arr == 1) {

100

89 2 6 push @out, '-' x 75;

90 2 10 my $header = sprintf('%9s ' x scalar @header, @header);

91 2 3 push @out, $header;

92 2 3 push @out, '-' x 75;

93 2 22 my @undef = grep {!defined $data->[$_]} 0..scalar @{ $data }-1;

198 193

2 14

94 2 50 7 if (scalar @undef > 0) {

95 0 0 say STDERR join (',', @undef);

96 0 0 die "The above indices are not defined in $current_sub";

97 }

98 2 3 my @numeric = grep {looks_like_number($_)} @{ $data };

198 230

2 4

99 2 48 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

100 2 73 my $vals = sprintf('%9.4g ' x scalar @header, scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

101 2 9 push @out, $vals;

102 } elsif ($ref_type eq 'ARRAY') {

103 1 13 push @out, '-' x 75;

104 1 5 my $header = sprintf('%9s ' x scalar @header, @header);

105 1 3 unshift @header, 'Index';

106 1 2 $header = 'Index ' . $header;

107 1 1 push @out, $header;

108 1 2 push @out, '-' x 75;

109 1 1 my $rows_printed = 0;

110 1 4 foreach my $index (0..$#$data) {

111 2 2 my @undef = grep {!defined $data->[$index][$_]} 0..scalar @{ $data->[$index] }-1;

18 34

2 5

112 2 50 5 if (scalar @undef > 0) {

113 0 0 say STDERR join (',', @undef);

114 0 0 die "The above indices are not defined for index $index in $current_sub";

115 }

116 2 3 my @numeric = grep {looks_like_number($_)} @{ $data->[$index] };

18 23

2 3

117 2 16 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

118 2 27 my $vals = sprintf('%6.4g', $index) . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

119 2 4 push @out, $vals;

120 2 3 $rows_printed++;

121 2 50 6 last if $rows_printed >= $args{nrows}; # Changed to >= just to be safe

122 }

123 } elsif ($ref_type eq 'HASH') {

124 2 5 push @out, '-' x 78;

125 2 8 my $header = sprintf('%9s ' x scalar @header, @header);

126 2 3 unshift @header, 'Key';

127 2 5 $header = ' Key ' . $header;

128 2 2 push @out, $header;

129 2 3 push @out, '-' x 78;

130 2 2 my $rows_printed = 0;

131 2 3 foreach my $key (sort {lc $a cmp lc $b} keys %{ $data }) {

4 10

2 10

132 3 4 my @undef = grep {!defined $data->{$key}[$_]} 0..scalar @{ $data->{$key} }-1;

27 31

3 7

133 3 50 8 if (scalar @undef > 0) {

134 0 0 say STDERR join (',', @undef);

135 0 0 die "The above indices are not defined for key $key in $current_sub";

136 }

137 3 3 my @numeric = grep {looks_like_number($_)} @{ $data->{$key} };

27 58

3 4

138 3 24 my $q = quantile(\@numeric, probs => [0.25, 0.75]);

139 3 6 my $print_key = substr($key, 0, 9);

140 3 50 8 if ((length $print_key) < 9) { # make sure that short keys line up correctly

141 3 7 $print_key .= ' ' x (9 - length $print_key);

142 }

143 3 40 my $vals = $print_key . sprintf('%9.4g ' x (scalar @header - 1), scalar @numeric, min(\@numeric), $q->{'25%'}, median(\@numeric), mean(\@numeric), $q->{'75%'}, max(\@numeric));

144 3 6 push @out, $vals;

145 3 4 $rows_printed++;

146 3 100 11 last if $rows_printed >= $args{nrows};

147 }

148 }

149 5 60 say join ("\n", @out);

150 5 27 return \@out;

151 }

152

153 sub read_table {

154 523 523 1 736915 my $file = shift;

155 523 50 5180 die "\"$file\" is not a file" unless -f $file;

156 523 50 3195 die "\"$file\" is not readable" unless -r $file;

157

158 523 663 my %input_args = @_;

159 523 100 769 if (defined $input_args{delim}) {

160 1 3 $input_args{sep} = delete $input_args{delim};

161 }

162

163 523 534 my $default_sep = ',';

164 523 100 2033 if ($file =~ /\.tsv$/i) {

165 2 4 $default_sep = "\t";

166 } elsif ($file =~ /\.csv$/i) {

167 521 594 $default_sep = ",";

168 }

169

170 523 1148 my %args = (

171 sep => $default_sep,

172 comment => '#',

173 %input_args,

174 );

175

176 523 693 my %allowed_args = map {$_ => 1} (

3138 3906

177 'comment', 'output.type', 'filter', 'row.names', 'sep', 'delim'

178 );

179 523 930 my @undef_args = sort grep {!$allowed_args{$_}} keys %args;

1067 1501

180 523 50 813 if (scalar @undef_args > 0) {

181 0 0 my $current_sub = (split(/::/,(caller(0))[3]))[-1]; # only needed on the error path

182 0 0 say join (', ', @undef_args);

183 0 0 die "the above args aren't defined for $current_sub";

184 }

185 523 100 1276 $args{'output.type'} = $args{'output.type'} // 'aoh';

186 523 100 1127 if ($args{'output.type'} !~ m/^(?:aoh|hoa|hoh)$/) {

187 1 14 die "\"$args{'output.type'}\" isn't allowed";

188 }

189 522 550 my $filter = $args{filter};

190 522 50 66 1308 if (defined $filter && ref($filter) eq 'CODE') {

191 0 0 $filter = { 0 => $filter };

192 } elsif (defined $filter && ref($filter) ne 'HASH') {

193 0 0 die "'filter' must be a CODE or HASH reference";

194 }

195 522 591 my (@data, %data, @header, %mapped_filters, @sorted_filter_flds, %seen_rownames);

196 _parse_csv_file($file, $args{sep} // '', $args{comment} // '', sub {

197 6718 6718 7178 my ($line_ref) = @_;

198 6718 100 7792 if (!@header) {

199 # --- HEADER PROCESSING (copy made only here; runs once) ---

200 522 907 my @line = @$line_ref;

201 522 50 33 2691 $line[0] =~ s/^\Q$args{comment}\E// if @line && defined $line[0];

202 # NOTE: trailing-empty stripping removed — the header is now treated like

203 # data rows, so a consistent trailing separator no longer produces a false

204 # "Alignment error". The alignment check still rejects genuinely ragged rows.

205 522 752 @header = @line;

206 522 100 66 1180 if ((scalar @header > 0) && ($header[0] eq '')) {

207 10 15 $header[0] = 'row_name';

208 }

209 522 526 my %seen_h;

210 522 561 my @dup_cols = grep { $seen_h{$_}++ } @header;

1166 1792

211 522 100 722 warn "read_table: duplicate column name(s) in $file: @dup_cols (later values win)\n" if @dup_cols;

212 522 100 100 893 if (($args{'output.type'} eq 'hoh') && (not defined $args{'row.names'})) {

213 4 7 $args{'row.names'} = $header[0];

214 }

215 522 50 66 734 if ((defined $args{'row.names'}) && (!grep {$_ eq $args{'row.names'}} @header)) {

61 80

216 0 0 die "\"$args{'row.names'}\" isn't in the header of $file";

217 }

218 522 100 669 if ($filter) {

219 4 11 for my $k (keys %$filter) {

220 4 50 17 if ($k =~ /^\d+$/) {

221 0 0 $mapped_filters{$k} = $filter->{$k};

222 } else {

223 4 50 my ($idx) = grep { $header[$_] eq $k } 0..$#header;

44 55

224 4 50 13 die "Filter column '$k' not found in header" unless defined $idx;

225 4 18 $mapped_filters{$idx + 1} = $filter->{$k};

226 }

227 }

228 4 13 @sorted_filter_flds = sort { $a <=> $b } keys %mapped_filters; # constant per file

0 0

229 }

230 522 2093 return;

231 }

232 # --- DATA PROCESSING (operate on $line_ref directly; no per-row array copy) ---

233 6196 100 7248 if (scalar @$line_ref != scalar @header) {

234 1 12 die "Alignment error on $file (" . scalar(@$line_ref) . " fields vs " . scalar(@header) . " headers).";

235 }

236 6195 5423 my %line_hash;

237 6195 7706 for my $i (0 .. $#header) {

238 74732 65383 my $v = $line_ref->[$i];

239 74732 100 66 146376 $line_hash{$header[$i]} = (!defined($v) || $v eq '') ? undef : $v;

240 }

241 # --- APPLY FILTERS ---

242 6195 100 7181 if (@sorted_filter_flds) {

243 1847 10085 local %_ = %line_hash; # row available as %_; set once per row, not per field

244 1847 2557 my $skip = 0;

245 1847 1773 foreach my $fld (@sorted_filter_flds) {

246 1847 50 2438 local $_ = $fld == 0 ? $line_ref : $line_ref->[$fld - 1];

247 1847 100 2469 if (!$mapped_filters{$fld}->($line_ref, \%line_hash)) { $skip = 1; last; }

1131 2192

1131 1103

248 716 50 1798 if ($fld > 0) { # write back any mutation the callback made to $_

249 716 683 $line_ref->[$fld - 1] = $_;

250 716 100 66 1659 $line_hash{$header[$fld - 1]} = (!defined($_) || $_ eq '') ? undef : $_;

251 }

252 }

253 1847 100 11027 return if $skip;

254 }

255 # Populate requested data structure

256 5064 100 7635 if ($args{'output.type'} eq 'aoh') {

100

257 1863 12097 push @data, \%line_hash;

258 } elsif ($args{'output.type'} eq 'hoa') {

259 1114 1153 push @{ $data{$_} }, $line_hash{$_} for @header;

15757 30964

260 } elsif ($args{'output.type'} eq 'hoh') {

261 2087 2131 my $row_name = $line_hash{$args{'row.names'}};

262 warn "read_table: duplicate row name '$row_name' in $file (later values win)\n"

263 2087 100 3267 if $seen_rownames{$row_name}++;

264 2087 1962 foreach my $col (@header) {

265 29171 100 32791 next if $col eq $args{'row.names'};

266 27084 52166 $data{$row_name}{$col} = $line_hash{$col};

267 }

268 }

269 522 50 15799 });

270 521 100 6397 if ($args{'output.type'} eq 'aoh') {

271 509 2146 return \@data;

272 } else { # hoa or hoh

273 12 7422 return \%data;

274 }

275 }

276

277 1;

278 =encoding utf8

279

280 =head1 Synopsis

281

282 Get basic statistical functions working in Perl as if they were part of List::Util, like C, C, C, etc.

283 I've used Artificial Intelligence tools such as Claude, Gemini, and Grok to write this as well as using my own gray matter.

284 There are other similar tools on CPAN, but I want speed and a form like List::Util, which I've gotten here with the help of AI, which often required many attempts to do correctly.

285 This is meant to call subroutines directly through eXternal Subroutines (XS) for performance and portability.

286

287 There B other modules on CPAN that can do B of this, but this works the way that I B it to.

288

289 =head1 Functions/Subroutines

290

291 ========================================================================

292

293 =head2 add_data

294

295 Add data to an existing hash or array reference. This function acts as the equivalent of adding new rows, as well as an C (described below). It dynamically infers your target data structure, handles deeply nested records, and seamlessly coerces mismatched data shapes to preserve the structural integrity of your primary reference.

296

297 =head3 Hash of Hashes (HoH)

298

299 When the target is a Hash of Hashes, incoming hash keys update existing rows, and new keys create new rows.

300

301 $data = { 'Jack Smith' => { age => 30 } };

302

303 $n = {

304 'Jack Smith' => { # Update existing (Hash)

305 dept => 'Engineering'

306 },

307 'Jane Doe' => { age => 25, dept => 'Sales' }, # Add new (Hash)

308 'Invalid' => 'Not a reference' # Edge case safety

309 };

310

311 add_data($data, $n);

312

313 B

314

315 {

316 "Jack Smith": {

317 "age": 30,

318 "dept": "Engineering"

319 },

320 "Jane Doe": {

321 "age": 25,

322 "dept": "Sales"

323 }

324 }

325

326 =head3 Hash of Arrays (HoA)

327

328 When the target is a Hash of Arrays, incoming arrays are pushed onto the existing arrays, appending the new elements, similarly to R's C.

329

330 $data = { 'Project Alpha' => [ 'task1', 'task2' ] };

331

332 $n = {

333 'Project Alpha' => [ 'task3' ], # Appends to existing array

334 'Project Beta' => [ 'task1', 'task2' ] # Creates new array row

335 };

336

337 add_data($data, $n);

338

339 B

340

341 {

342 "Project Alpha": [ "task1", "task2", "task3" ],

343 "Project Beta": [ "task1", "task2" ]

344 }

345

346 =head3 Array of Hashes / Arrays (AoH / AoA)

347

348 C now natively supports Array references at the root level. When targeting an Array, it iterates through the source array and merges data at the corresponding indices.

349

350 $data = [

351 { id => 1, name => 'Alice' }

352 ];

353

354 $n = [

355 { role => 'Admin' }, # Updates index 0

356 { id => 2, name => 'Bob' } # Creates index 1

357 ];

358

359 add_data($data, $n);

360

361 B

362

363 [

364 { "id": 1, "name": "Alice", "role": "Admin" },

365 { "id": 2, "name": "Bob" }

366 ]

367

368 =head3 Advanced Structural Coercion & Cross-Merging

369

370 C strictly enforces the primary structure of your target reference (determined by inspecting its outer and inner bounds). If you mix Array and Hash types, the function automatically coerces the incoming data to match the target.

371

372 B<1. Inner Coercion (Mixing Rows):>

373

374 =over

375

376 =item * B Source Array rows are read in pairs and converted to key-value pairs.

377

378 =item * B Source Hash rows are flattened into key-value pairs and pushed onto the array.

379

380 =back

381

382 B<2. Root-Level Coercion (Mixing Outer Containers):>

383

384 =over

385

386 =item * B The function evaluates the Hash keys as numeric indices. (e.g., source key C<"0"> merges into target array index C<[0]>). Non-numeric keys are safely ignored.

387

388 =item * B The function converts the Array indices into stringified Hash keys. (e.g., source array index C<[1]> merges into target hash key C<"1">).

389

390 =back

391

392 =head3 Source is a mixed Hash. Keys dictate the target array index!

393

394 $n = {

395 '0' => { y => 20 }, # Merges into $data->[0]

396 '1' => [ 'z', 30 ], # Array pair coerced to Hash, creates $data->[1]

397 'ignored' => { k => 'v' } # Ignored: cannot map to an array index

398 };

399

400 add_data($data, $n);

401

402 B

403

404 [

405 { "x": 10, "y": 20 },

406 { "z": 30 }

407 ]

408

409 NB: If C is called on a completely empty target reference (e.g., C<$data = {}> or C<$data = []>), it will intelligently infer the required inner structure (Hashes vs Arrays) by inspecting the first valid row of the source data.

410

411 =head2 aov

412

413 Warning: assumes normal distribution

414

415 aov(

416 {

417 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

418 ctrl => [1, 1, 1, 0, 0, 0]

419 },

420 'yield ~ ctrl');

421

422 which returns

423

424 {

425 ctrl {

426 Df 1,

427 "F value" 25.6000000000001,

428 "Mean Sq" 1.70666666666667,

429 Pr(>F) 0.00718232855871859,

430 "Sum Sq" 1.70666666666667

431 },

432 Residuals {

433 Df 4,

434 "Mean Sq" 0.0666666666666665,

435 "Sum Sq" 0.266666666666666

436 }

437 }

438

439 You can also perform Two-Way ANOVA with categorical interactions using the C<*> operator. The parser will implicitly evaluate the main effects alongside the interaction:

440

441 my $res_2way = aov($data_2way, 'len ~ supp * dose');

442

443 It is robust against rank deficiency; collinear terms will gracefully receive 0 degrees of freedom and 0 sum of squares, matching R's behavior.

444

445 =head3 Input Parameters

446

447

448

449 =begin html

450

451

452
453
454								Parameter
455								Type
456								Default
457								Description
458								Example
459
460
461
462
463								`data_sv`
464								`HashRef` or `ArrayRef`
465								(Required)
466								The dataset to analyze. Accepts a Hash of Arrays (HoA) or Array of Hashes (AoH). If no formula is provided, it must be an HoA to allow automatic stacking (mimicking R's `stack()` on a named list).
467
468
469
470								`formula_sv`
471								`String`
472								`undef`
473								A symbolic description of the model to be fitted. If omitted, the formula automatically defaults to `'Value ~ Group'` and the input data is stacked.
474								`'yield ~ N * P'`
475
476
477

478

479 =end html

480

481

482

483 =head3 Output Variables

484

485 The function returns a single C containing the evaluated statistical results. Because the keys map dynamically to the terms parsed from your formula, the structure will vary based on your inputs.

486

487

488

489 =begin html

490

491

492
493
494								Parameter
495								Type
496								Default
497								Description
498								Example
499
500
501
502
503								(Term Name)
504								`HashRef`
505								`undef`
506								A nested hash for each independent term in the formula (e.g., `'Group'`, `'N:P'`), containing its ANOVA table statistics.
507								`{'Df' => 1, 'Sum Sq' => 14.2, 'Mean Sq' => 14.2, 'F value' => 25.81, 'Pr(>F)' => 0.0004}`
508
509
510								`Residuals`
511								`HashRef`
512								`undef`
513								A nested hash containing the residual (error) statistics for the fitted model.
514								`{'Df' => 10, 'Sum Sq' => 5.5, 'Mean Sq' => 0.55}`
515
516
517								`group_stats`
518								`HashRef`
519								`undef`
520								A nested hash containing descriptive statistics (`mean` and `size` / count) for every column evaluated in the original unstacked data structure.
521								`{'mean' => {'A' => 2.1, 'B' => 5.4}, 'size' => {'A' => 10, 'B' => 10}}`
522
523
524

525

526 =end html

527

528

529

530 =head3 omitting formula

531

532 As of version 0.07, in the case of an omitted formula, stacking is done:

533

534 aov(

535 {

536 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

537 ctrl => [1, 1, 1, 0, 0, 0]

538 },

539 );

540

541 is the equivalent of:

542

543 yield <- c(5.5, 5.4, 5.8, 4.5, 4.8, 4.2)

544 ctrl <- c(1, 1, 1, 0, 0, 0)

545

546 # Combine them into a named list (the R equivalent of your hash)

547 my_list <- list(yield = yield, ctrl = ctrl)

548

549 # Convert the list into a "long" dataframe

550 # This creates two columns: "values" and "ind" (the group name)

551 my_data <- stack(my_list)

552

553 # Rename columns for clarity (optional but good practice)

554 colnames(my_data) <- c("Value", "Group")

555 anova_model <- aov(Value ~ Group, data = my_data)

556 summary(anova_model)

557

558 in R

559

560 =head2 cfilter

561

562 Select B out of a table and return it in the same shape. A column is

563 the inner (second-level) key of a B or an B,

564 or the outer key of a B:

565

566 use Stats::LikeR;

567 my %hoa = ( x => [1,2,3], y => [4,5,6], z => [0,0,0] );

568 cfilter(\%hoa, keep => ['x','y']); # { x => [1,2,3], y => [4,5,6] }

569 cfilter(\%hoa, remove => ['z']); # { x => [1,2,3], y => [4,5,6] }

570

571 C takes exactly one of C or C. C returns only the

572 matching columns; C returns everything except them. The result is the

573 same shape as the input (HoH → HoH, HoA → HoA, AoH → AoH), with cell values

574 copied and the original structure left untouched.

575

576 =head3 Selecting by name

577

578 Pass an array ref of column names. Naming a column that is not present in the

579 data is an error (it catches typos), and a row that happens not to contain a

580 kept column simply comes back without it:

581

582 my @aoh = ( { a => 1, b => 2 }, { a => 3 } );

583 cfilter(\@aoh, keep => ['b']); # [ { b => 2 }, {} ]

584

585 =head3 Selecting by a predicate

586

587 Instead of names, C/C accept a B — a CODE ref or a

588 function name — evaluated once per column. It is called as

589

590 $predicate->($column_values, $column_name)

591

592 where C<$column_values> is an array ref of the column's B cells (undef

593 and missing cells are dropped, so functions like C get clean input).

594 With C, columns for which the predicate is true are kept; with C,

595 those columns are dropped.

596

597 # Keep only the constant columns (standard deviation zero):

598 my $const = cfilter(\%hoa, keep => sub { sd($_[0]) == 0 }); # { z => [0,0,0] }

599 # Drop the constant columns instead:

600 my $varying = cfilter(\%hoa, remove => sub { sd($_[0]) == 0 }); # { x=>..., y=>... }

601 # A bare function name resolves in Stats::LikeR:: (use a package for your own):

602 cfilter(\%hoa, keep => 'some_predicate');

603

604 A bare string is always treated as a B, not a single column

605 name, so to keep one column by name use an array ref: C<< keep =E ['x'] >>.

606

607 =head3 Errors

608

609 C dies (via C) when:

610

611 =over

612

613 =item * neither C nor C is given, or both are,

614

615 =item * a named column is not present in the data,

616

617 =item * the selector is neither an array ref nor a code ref / function name, or the

618 function name cannot be resolved,

619

620 =item * an unknown option is given, or the options are not C<< name =E value >> pairs,

621

622 =item * the data is not a hash/array reference of the expected shape (a hash of hash

623 refs or array refs, or an array of hash refs).

624

625 =back

626

627 =head2 chisq_test

628

629 The C function performs chi-squared contingency table tests and goodness-of-fit tests. It natively accepts both arrays and hashes (1D and 2D) and mathematically mirrors R's C, returning a structured hash reference of the results.

630

631 For 2x2 matrices, Yates' Continuity Correction is applied automatically.

632

633 =head3 Accepted Inputs

634

635

636

637 =begin html

638

639

640
641
642								Input Type
643								Data Structure
644								Applied Test
645
646
647
648
649								1D Array
650								`[ $v1, $v2, ... ]`
651								Chi-squared test for given probabilities
652
653
654								2D Array
655								`[ [ $v1, $v2 ], [ $v3, $v4 ] ]`
656								Pearson's Chi-squared test (Yates' correction if 2x2)
657
658
659								1D Hash
660								`{ key1 => $v1, key2 => $v2 }`
661								Chi-squared test for given probabilities
662
663
664								2D Hash
665								`{ row1 => { c1 => $v1, c2 => $v2 } }`
666								Pearson's Chi-squared test (Yates' correction if 2x2)
667
668
669

670

671 =end html

672

673

674

675 =head3 Output Object Structure

676

677 The function returns a single Hash Reference containing the following key-value pairs. The internal structure of C and C will always identically match the structure of your input.

678

679

680

681 =begin html

682

683

684
685
686								Key
687								Data Type
688								Description
689
690
691
692
693								data.name
694								String
695								Identifies the input type (e.g., `"Perl ArrayRef"` or `"Perl HashRef"`).
696
697
698								expected
699								Array/Hash Ref
700								The expected frequencies, matching the geometry of the input.
701
702
703								method
704								String
705								The specific statistical test applied.
706
707
708								observed
709								Array/Hash Ref
710								The original data passed to the function.
711
712
713								p.value
714								Float
715								The calculated p-value of the test.
716
717
718								parameter
719								Hash Ref
720								Contains the degrees of freedom (`df`).
721
722
723								statistic
724								Hash Ref
725								Contains the test statistic (`X-squared`).
726
727
728

729

730 =end html

731

732

733

734 =head3 Two-Dimensional Array

735

736 Passing an Array of Arrays (AoA) triggers a standard Pearson's Chi-squared test. If the input is exactly a 2x2 matrix, Yates' continuity correction is applied automatically.

737

738 my $test_data = [

739 [762, 327, 468],

740 [484, 239, 477]

741 ];

742 my $res = chisq_test($test_data);

743

744 B

745

746 {

747 'data.name' => 'Perl ArrayRef',

748 'expected' => [

749 [ 703.671381936888, 319.645266594124, 533.683351468988 ],

750 [ 542.328618063112, 246.354733405876, 411.316648531012 ]

751 ],

752 'method' => "Pearson's Chi-squared test",

753 'observed' => [

754 [ 762, 327, 468 ],

755 [ 484, 239, 477 ]

756 ],

757 'p.value' => 2.95358918321176e-07,

758 'parameter' => { 'df' => 2 },

759 'statistic' => { 'X-squared' => 30.0701490957547 }

760 }

761

762 =head3 1-Dimensional Array (Goodness of Fit)

763

764 Passing a flat Array Reference triggers a Goodness of Fit test, assuming equal expected probabilities across all items.

765

766 my $data = [10, 20, 30];

767 my $res = chisq_test($data);

768

769 B

770

771 {

772 'data.name' => 'Perl ArrayRef',

773 'expected' => [ 20, 20, 20 ],

774 'method' => 'Chi-squared test for given probabilities',

775 'observed' => [ 10, 20, 30 ],

776 'p.value' => 0.00673794699908547,

777 'parameter' => { 'df' => 2 },

778 'statistic' => { 'X-squared' => 10 }

779 }

780

781 =head3 2-Dimensional Hash (Pearson's Chi-squared)

782

783 Passing a Hash of Hashes (HoH) applies the exact same logic as a 2D Array, but preserves your nested string keys in the output. This is particularly useful when mapping data extracted directly from JSON, databases, or categorical mappings.

784

785 my $data = {

786 GroupA => { Success => 10, Failure => 15 },

787 GroupB => { Success => 20, Failure => 5 }

788 };

789

790 my $res = chisq_test($data);

791

792 B

793

794 {

795 'data.name' => 'Perl HashRef',

796 'expected' => {

797 'GroupA' => { 'Failure' => 10, 'Success' => 15 },

798 'GroupB' => { 'Failure' => 10, 'Success' => 15 }

799 },

800 'method' => "Pearson's Chi-squared test with Yates' continuity correction",

801 'observed' => {

802 'GroupA' => { 'Failure' => 15, 'Success' => 10 },

803 'GroupB' => { 'Failure' => 5, 'Success' => 20 }

804 },

805 'p.value' => 0.00937475878430379,

806 'parameter' => { 'df' => 1 },

807 'statistic' => { 'X-squared' => 6.75 }

808 }

809

810 =head3 One-Dimensional Hash (Goodness of Fit)

811

812 Flat Hash References evaluate Goodness of Fit while preserving your categorical keys in the C and C output blocks.

813

814 my $data = {

815 Apples => 10,

816 Oranges => 20,

817 Bananas => 30

818 };

819

820 my $res = chisq_test($data);

821

822 =head1 C

823

824 Apply a B to every pair of columns in a table and collect

825 the answers in a hash of hashes.

826

827 It's the workhorse behind things like correlation matrices: give it your data and

828 the name of a function that takes two columns (C, C, …) and you get

829 back every column compared against every other column.

830

831 use Stats::LikeR;

832

833 my %data = (

834 height => [ 170, 165, 180, 175 ],

835 weight => [ 70, 60, 85, 77 ],

836 age => [ 30, 41, 25, 38 ],

837 );

838

839 my $result = col2col(\%data, 'cor');

840

841 # $result->{height}{weight} == correlation of height vs weight

842 # $result->{height}{age} == correlation of height vs age

843 # ...and so on for every pair

844

845 ========================================================================

846

847 =head2 Arguments

848

849 col2col( $data, $command, $cols, %options )

850 col2col( $data, $command, \%options ) # options in place of $cols

851

852

853

854 =begin html

855

856

857
858
859								Position
860								Argument
861								What it is
862
863
864
865
866								1
867								`$data`
868								Your table, as a reference (see Data shapes below).
869
870
871								2
872								`$command`
873								A code block or the name of a two-column function.
874
875
876								3
877								`$cols`
878								(optional) Which columns to use as the "from" side. Omit for all.
879
880
881								4+
882								`%options`
883								(optional) `na`, `skip.errors`, … (see Options).
884
885
886

887

888 =end html

889

890

891

892 ========================================================================

893

894 =head2 Data shapes

895

896 C understands three layouts. In every case a B is the thing that

897 gets compared, and the result is keyed by column name.

898

899 B — keys are column names:

900

901 my %hoa = ( a => [1, 2, 3], b => [4, 5, 6] );

902

903 B — First keys are row names, second keys are columns:

904

905 my %hoh = (

906 row1 => { a => 1, b => 4 },

907 row2 => { a => 2, b => 5 },

908 );

909

910 B — each element is a row, inner keys are columns:

911

912 my @aoh = ( { a => 1, b => 4 }, { a => 2, b => 5 } );

913

914 All three produce the same result for the same underlying numbers. Missing or

915 C cells are handled by the C option (below).

916

917 ========================================================================

918

919 =head2 The command

920

921 The second argument is the function applied to each pair of columns. It is called

922 as:

923

924 $command->( $column_a, $column_b ) # two ARRAY refs

925

926 so inside a block the two columns arrive in C<@_>:

927

928 my $result = col2col(\%data, sub {

929 my ($x, $y) = @_; # $x and $y are array refs

930 cor($x, $y);

931 });

932

933 You can also pass a B. A bare name is looked up in

934 C, so these two are equivalent:

935

936 col2col(\%data, 'cor');

937 col2col(\%data, sub { cor($_[0], $_[1]) });

938

939 ========================================================================

940

941 =head2 The result

942

943 Always a hash of hashes: B<< C<< $result-E{from}{to} >> >>.

944

945 for my $from (sort keys %$result) {

946 for my $to (sort keys %{ $result->{$from} }) {

947 printf "%s vs %s = %s\n", $from, $to, $result->{$from}{$to};

948 }

949 }

950

951 A column is never compared with itself, so C<< $result-E{a}{a} >> does not exist.

952

953 ========================================================================

954

955 =head2 Restricting columns (C<$cols>)

956

957 By default every column is used as the "from" side. The third argument narrows

958 that down — handy when you only care about one variable.

959

960 # all columns vs all columns

961 my $all = col2col(\%data, 'cor');

962 # just ONE column vs every other column

963 my $one = col2col(\%data, 'cor', 'height');

964 my $cors = $one->{height}; # { weight => ..., age => ... }

965 # a FEW specific columns vs every other column

966 my $few = col2col(\%data, 'cor', ['height', 'weight']);

967

968 The "to" side is always every other column; C<$cols> only limits the outer keys.

969

970 ========================================================================

971

972 =head2 Options

973

974 Options can be given two ways:

975

976 col2col(\%data, 'cor', $cols, 'skip.errors' => 0); # after $cols

977 col2col(\%data, 'cor', { 'skip.errors' => 0 }); # hash ref, no $cols needed

978

979 The hash-ref form is convenient when you have B column restriction — it saves

980 you from passing a placeholder. (A hash ref I C<$cols>, so you can't use

981 it to restrict columns at the same time; use the trailing form for that.)

982

983 =head3 C — how undefined values are handled

984

985 Real data has gaps. C decides what the function sees.

986

987

988

989 =begin html

990

991

992
993
994								Value
995								Behaviour
996								Use for
997
998
999
1000
1001								`'pairwise'` (default)
1002								A row is used for a pair only if both columns are defined there. The two columns arrive aligned and equal-length.
1003								Paired stats like `cor`.
1004
1005
1006								`'omit'`
1007								Each column drops its own undefined values independently. The two columns may end up different lengths.
1008								Unpaired tests like `t_test`, `kruskal_test`, where a gap in one sample shouldn't discard a value in the other.
1009
1010
1011								`'keep'`
1012								Every row is passed through, `undef` and all.
1013								When your function does its own missing-data handling.
1014
1015
1016

1017

1018 =end html

1019

1020

1021

1022 # correlation: keep only complete pairs (the default)

1023 col2col(\%data, 'cor');

1024 # two-sample test: each column keeps its own values

1025 col2col(\%data, 't_test', undef, na => 'omit');

1026 col2col(\%data, 't_test', { na => 'omit' }); # same, no placeholder

1027

1028 C / C remain as boolean aliases for backward compatibility:

1029 C means C<'pairwise'>, C means C<'keep'>. Don't combine them with C.

1030

1031 =head3 C — keep going when a pair fails I<(default: true)>

1032

1033 Some functions croak on degenerate input — for example C dies if a column has

1034 zero variance. By default C B that croak per pair: instead of

1035 aborting the whole run, it stores the B of the error message in that

1036 cell, so the result tells you I pair failed and I. Every other cell is

1037 computed normally.

1038

1039 my $r = col2col(\%data, 'cor');

1040 # a good pair: $r->{a}{b} == 0.83

1041 # a bad pair: $r->{a}{const} eq 'cor: standard deviation of y is 0'

1042

1043 To restore the old "die on the first error" behaviour, turn it off:

1044

1045 col2col(\%data, 'cor', undef, 'skip.errors' => 0);

1046 col2col(\%data, 'cor', { 'skip.errors' => 0 });

1047

1048 Only errors from B are trapped. Mistakes in the call itself

1049 (unknown column, bad data, unknown function name, unknown option) always die.

1050

1051 ========================================================================

1052

1053 =head2 Worked examples

1054

1055 B

1056

1057 my $m = col2col(\%data, 'cor');

1058

1059 B

1060

1061 my $col = 'Testosterone, total (nmol/L)';

1062 my $cors = col2col($hoa, 'cor', $col)->{$col};

1063 for my $other (sort { ($cors->{$b} // -2) <=> ($cors->{$a} // -2) } keys %$cors) {

1064 next unless $cors->{$other} =~ /^-?\d/; # skip cells holding an error message

1065 printf "%-30s % .3f\n", $other, $cors->{$other};

1066 }

1067

1068 B

1069

1070 my $t = col2col($hoa, 't_test', undef, na => 'omit');

1071

1072 B

1073

1074 my $m = col2col($hoa, 'cor');

1075 for my $from (sort keys %$m) {

1076 for my $to (sort keys %{ $m->{$from} }) {

1077 my $v = $m->{$from}{$to};

1078 warn "$from vs $to: $v\n" if defined $v && $v !~ /^-?\d/; # non-numeric = error

1079 }

1080 }

1081

1082 ========================================================================

1083

1084 =head2 Gotchas

1085

1086 =over

1087

1088 =item * B, C<($col_a, $col_b)> — not a column and

1089 a name. Unpack with C.

1090

1091 =item * B<< C<'pairwise'> can still hit a constant I. >> A column with overall

1092 variance can be flat on just the rows it shares with one partner, so C may

1093 still croak for that pair. With the default C, that shows up as a

1094 message in the single offending cell rather than killing the run.

1095

1096 =item * B<< C does not modify your data. >> It reads the table and returns a new

1097 hash of hashes.

1098

1099 =item * B — i.e.

1100 C is the inner ("to") key. So C<< $result-E{A}{B} >> reading C<…deviation of y is 0>

1101 means column C is the degenerate one for that pair.

1102

1103 =back

1104

1105 =head2 cor

1106

1107 cor($array1, $array2, $method = 'pearson'),

1108

1109 that is, C is the default and will be used if C<$method> is not specified.

1110

1111 Just like R, C, C, and C are available

1112

1113 If you provide an array of arrays (a matrix), C will compute the correlation matrix automatically.

1114

1115 =head2 cor_test

1116

1117 my $result = cor_test(

1118 'x' => $x,

1119 'y' => $y,

1120 alternative => 'two.sided',

1121 method => 'pearson',

1122 continuity => 1

1123 );

1124

1125 C safely handles C (or C) values seamlessly by computing over pairwise complete observations.

1126

1127 =head2 cov

1128

1129 cov($array1, $array2, 'pearson')

1130

1131 or

1132

1133 cov($array1, $array2, 'spearman')

1134

1135 or

1136

1137 cov($array1, $array2, 'kendall')

1138

1139 =head2 dnorm

1140

1141 gives the density of the normal distribution, with the specified mean and standard deviation.

1142

1143 In other words, the predicted height of the value C, given a mean, standard deviation, and whether or not to use a log value.

1144

1145 returns a single scalar/number if a single value is given, otherwise returns an array reference.

1146

1147 Usage:

1148

1149 dnorm(4) # assumes a mean of 0 and standard deviation of 1

1150

1151 but default mean, standard deviation, and log can be passed as parameters:

1152

1153 $x = dnorm(0, mean => 0, sd => 2, 'log' => 0);

1154

1155 =head2 filter

1156

1157 Return a new data frame containing only the rows of C<$df> that match a predicate. The original C<$df> is never modified.

1158

1159 my $df2 = filter($df, col('column.name') > 4);

1160

1161 C accepts a predicate in one of two forms:

1162

1163 =over

1164

1165 =item 1. a B<< C expression >> — a small, composable comparison built with overloaded operators, and

1166

1167 =item 2. a B — for anything the operators can't express (multiple columns, regexes, arbitrary logic), in the same spirit as the C option of L<#>.

1168

1169 =back

1170

1171 Both C and C

1172

1173 =head3 Arguments

1174

1175

1176

1177 =begin html

1178

1179

1180
1181
1182								Position
1183								Name
1184								Description
1185
1186
1187
1188
1189								1
1190								`$df`
1191								The data frame to filter. Either an array of hashes (AoH — e.g. the default output of `read_table`) or a hash of arrays (HoA).
1192
1193
1194								2
1195								predicate
1196								Either a `col()` comparison object or a `CODE` reference.
1197
1198
1199

1200

1201 =end html

1202

1203

1204

1205 The return value is a B data frame of the B as the input (AoH in → AoH out, HoA in → HoA out). For an HoA, every column is filtered in parallel by row index, so all returned columns stay the same length and aligned.

1206

1207 =head3 The C form

1208

1209 C is a deferred reference to a column. It carries no data — only the column name — so it can be compared with a literal (or another value) to build a predicate that C evaluates once per row.

1210

1211 filter($df, col('age') >= 18); # keep rows where age >= 18

1212 filter($df, col('sex') eq 'f'); # keep rows where sex is 'f'

1213 filter($df, 18 <= col('age')); # operands may be in either order

1214

1215 =head3 Comparison operators

1216

1217

1218

1219 =begin html

1220

1221

1222
1223
1224								Kind
1225								Operators
1226								Comparison
1227
1228
1229
1230
1231								Numeric
1232								`>` `<` `>=` `<=` `==` `!=`
1233								numeric (the cell and the value are compared as numbers)
1234
1235
1236								String
1237								`gt` `lt` `ge` `le` `eq` `ne`
1238								string (the cell and the value are compared as strings)
1239
1240
1241

1242

1243 =end html

1244

1245

1246

1247 C may appear on either side of the operator; C<< 4 E col('x') >> is automatically rewritten to the equivalent C<< col('x') E 4 >>.

1248

1249 =head3 Combining predicates: C<&>, C<|>, C

1250

1251 Predicates compose with bitwise C<&> (and), C<|> (or), and C (not):

1252

1253 filter($df, (col('age') > 18) & (col('sex') eq 'f')); # and

1254 filter($df, (col('grp') eq 'a') | (col('grp') eq 'c')); # or

1255 filter($df, !(col('x') > 100)); # not

1256

1257 Comparison operators bind more tightly than C<&> and C<|>, so C<< (col('a') E 4) & (col('b') E 2) >> is parsed correctly, but the parentheses are recommended for readability.

1258

1259 =head3 The code-reference form

1260

1261 For logic the operators can't express, pass a C_{. It is called once per row; the B, available both as C<$_> and as the first argument C<$_[0]>. Return a true value to keep the row.}

1262

1263 filter($df, sub { $_->{x} > 4 && $_->{grp} eq 'a' });

1264 filter($df, sub { $_->{name} =~ /^A/ });

1265 filter($df, sub { $_[0]{score} > $_[0]{threshold} });

1266

1267 For an HoA, each row is assembled into a temporary hash reference (C<< { column =E value, ... } >>) before the sub is called, so the same C<< $_-E{column} >> syntax works regardless of the input shape.

1268

1269 =head3 Examples

1270

1271 use Stats::LikeR;

1272 my $df = read_table('patients.csv'); # array of hashes

1273 # numeric threshold

1274 my $adults = filter($df, col('Age') >= 18);

1275 # combine conditions

1276 my $target = filter($df, (col('Age') >= 18) & (col('Sex') eq 'f'));

1277 # arbitrary logic with a coderef

1278 my $flagged = filter($df, sub { $_->{ALT} > 40 || $_->{AST} > 40 });

1279 # hash-of-arrays input -> hash-of-arrays output, columns filtered in parallel

1280 my $hoa = read_table('patients.csv', 'output.type' => 'hoa');

1281 my $sub = filter($hoa, col('Age') > 32);

1282 # $sub->{Age}, $sub->{Sex}, ... are all the same length and row-aligned

1283

1284 =head3 Behavior and notes

1285

1286 =over

1287

1288 =item * B C builds and returns a new frame; C<$df> is left untouched.

1289

1290 =item * B<< A missing or C cell never matches >> a C comparison. For example C<< col('x') E 0 >> silently drops any row that has no C value or whose C is C.

1291

1292 =item * B, into the returned frame: the returned array references the I row hashes as the input (fast, low-memory). Mutating a row in the result would therefore also change it in the original. HoA values are copied into fresh arrays.

1293

1294 =item * B are well defined: a predicate true for every row returns a copy-shaped frame with all rows; a predicate true for none returns an empty frame (C<[]> for AoH, a hash of empty arrays for HoA).

1295

1296 =item * B Passing a non-reference, an array element that is not a hash reference, or an HoA column that is not an array reference raises a descriptive error.

1297

1298 =item * B The C/operator layer is pure Perl (operator overloading); the per-row evaluation is done in XS.

1299

1300 =back

1301

1302 =head3 See also

1303

1304 C (whose C option applies the same coderef convention while reading a file), C.

1305

1306 =head2 fisher_test

1307

1308 =head3 array reference entry

1309

1310 my $array_data = [

1311 [10, 2],

1312 [3, 15]

1313 ];

1314 my $res1 = fisher_test($array_data);

1315

1316 which returns a hash reference:

1317

1318 {

1319 alternative "two.sided",

1320 conf_int [

1321 [0] 2.75343836564204,

1322 [1] 300.682787419401

1323 ],

1324 conf_level 0.95,

1325 estimate {

1326 "odds ratio" 21.3053312750168

1327 },

1328 method "Fisher's Exact Test for Count Data",

1329 p_value 0.000536724119143435

1330 }

1331

1332 =head3 hash reference entry

1333

1334 $ft = fisher_test( {

1335 Guess => {

1336 Milk => 3, Tea => 1

1337 },

1338 Truth => {

1339 Milk => 1, Tea => 3

1340 }

1341 });

1342

1343 =head2 glm

1344

1345 takes a hash of an array as input

1346

1347 my %tooth_growth = (

1348 dose => [qw(0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0

1349 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5

1350 0.5 0.5 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0

1351 2.0 2.0 2.0)],

1352 len => [qw(4.2 11.5 7.3 5.8 6.4 10.0 11.2 11.2 5.2 7.0 16.5 16.5 15.2 17.3 22.5

1353 17.3 13.6 14.5 18.8 15.5 23.6 18.5 33.9 25.5 26.4 32.5 26.7 21.5 23.3 29.5

1354 15.2 21.5 17.6 9.7 14.5 10.0 8.2 9.4 16.5 9.7 19.7 23.3 23.6 26.4 20.0

1355 25.2 25.8 21.2 14.5 27.3 25.5 26.4 22.4 24.5 24.8 30.9 26.4 27.3 29.4 23.0)],

1356 supp => [qw(VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC

1357 VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ

1358 OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ)]

1359 );

1360

1361 my $glm_teeth = glm(

1362 data => \%tooth_growth,

1363 formula => 'len ~ dose + supp',

1364 family => 'gaussian'

1365 );

1366

1367 In addition to the C default, it fully supports logistic regression using the C family parameter via Iteratively Reweighted Least Squares (IRLS):

1368

1369 my $glm_bin = glm(formula => 'am ~ wt + hp', data => \%mtcars, family => 'binomial');

1370

1371 =head3 Input Parameters

1372

1373

1374

1375 =begin html

1376

1377

1378
1379
1380								Parameter
1381								Type
1382								Default
1383								Description
1384								Example
1385
1386
1387
1388
1389								`formula`
1390								`String`
1391								None (Required)
1392								A symbolic description of the model to be fitted. Supports operators like `+`, `:`, `*`, `^`, and `-1` (to remove the intercept).
1393								`'am ~ wt + hp'`, `'y ~ x - 1'`
1394
1395
1396								`data`
1397								`HashRef` or `ArrayRef`
1398								None (Required)
1399								The dataset containing the variables used in the formula. Accepts either a Hash of Arrays (HoA) or an Array of Hashes (AoH).
1400								`\%mtcars`, `[{x => 1, y => 2}, ...]`
1401
1402
1403								`family`
1404								`String`
1405								`'gaussian'`
1406								A description of the error distribution and link function to be used in the model. Currently supports `'gaussian'` (identity link) and `'binomial'` (logit link).
1407								`'binomial'`
1408
1409
1410

1411

1412 =end html

1413

1414

1415

1416 =head3 Output variables

1417

1418

1419

1420 =begin html

1421

1422

1423
1424
1425								Variable
1426								Type
1427								Description
1428								Example
1429
1430
1431
1432
1433								`aic`
1434								`Double`
1435								Akaike's Information Criterion for the fitted model.
1436								`123.45`
1437
1438
1439								`boundary`
1440								`Integer (Boolean)`
1441								`1` if the fitted values computationally reached the `0` or `1` boundary (specific to the binomial family), `0` otherwise.
1442								`0`
1443
1444
1445								`coefficients`
1446								`HashRef`
1447								A hash mapping the expanded model term names to their estimated coefficient values.
1448								`{'Intercept' => 1.5, 'wt' => -0.5}`
1449
1450
1451								`converged`
1452								`Integer (Boolean)`
1453								`1` if the Iteratively Reweighted Least Squares (IRLS) algorithm converged within the maximum iterations, `0` otherwise.
1454								`1`
1455
1456
1457								`deviance`
1458								`Double`
1459								The residual deviance of the fitted model.
1460								`15.2`
1461
1462
1463								`deviance.resid`
1464								`HashRef`
1465								A hash mapping data row names to their computed deviance residuals.
1466								`{'Mazda RX4' => 0.12}`
1467
1468
1469								`df.null`
1470								`Integer`
1471								The residual degrees of freedom for the null model.
1472								`31`
1473
1474
1475								`df.residual`
1476								`Integer`
1477								The residual degrees of freedom for the fitted model.
1478								`30`
1479
1480
1481								`family`
1482								`String`
1483								The statistical family used to fit the model.
1484								`"gaussian"`
1485
1486
1487								`fitted.values`
1488								`HashRef`
1489								A hash mapping data row names to the fitted mean values (the model's predictions on the scale of the response).
1490								`{'Mazda RX4' => 0.85}`
1491
1492
1493								`iter`
1494								`Integer`
1495								The number of IRLS iterations performed before convergence or hitting the iteration limit.
1496								`4`
1497
1498
1499								`null.deviance`
1500								`Double`
1501								The deviance for the null model (a baseline model containing only an intercept, or an offset of 0 if the intercept is removed).
1502								`43.5`
1503
1504
1505								`rank`
1506								`Integer`
1507								The numeric rank of the fitted linear model (the number of estimated, non-aliased parameters).
1508								`2`
1509
1510
1511								`summary`
1512								`HashRef`
1513								A nested hash mapping each term to its detailed summary statistics, including `Estimate`, `Std. Error`, `t value` / `z value`, and `Pr(> t )` / `Pr(> z )`. Aliased parameters return `"NaN"`.
1514								`{'wt' => {'Estimate' => -0.5, 'Std. Error' => 0.1, ...}}`
1515
1516
1517								`terms`
1518								`ArrayRef`
1519								An ordered list of the expanded term names included in the model matrix.
1520								`['Intercept', 'wt', 'hp']`
1521
1522
1523

1524

1525 =end html

1526

1527

1528

1529 =head2 group_by

1530

1531 Take a hash of arrays, hash of hashes, or array of hashes, and group a column by another column.

1532

1533 my $aoh_data = [

1534 { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },

1535 { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },

1536 { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },

1537 { 'Gender' => 'Female' } # Intentional missing target value

1538 ];

1539

1540 as well as

1541

1542 $hoh_data = {

1543 'Patient_A' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 20.5 },

1544 'Patient_B' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => 1.8 },

1545 'Patient_C' => { 'Gender' => 'Male', 'Testosterone, total (nmol/L)' => 18.2 },

1546 'Patient_D' => { 'Gender' => 'Female' }, # Intentional missing target value

1547 'Patient_E' => { 'Gender' => 'Female', 'Testosterone, total (nmol/L)' => undef } # Explicit undef

1548 };

1549

1550 and

1551

1552 my $hoa_data = {

1553 'Gender' => ['Male', 'Female', 'Male', 'Female'],

1554 'Testosterone, total (nmol/L)' => [22.1, 2.5, 19.4, undef ]

1555 };

1556

1557 then run the function thus:

1558

1559 group_by( $hoa_data, 'Testosterone, total (nmol/L)', 'Gender');

1560

1561 The output can be thought of like a hash, with the first string broken down by the second.

1562

1563 all become hash of arrays:

1564

1565 {

1566 Female [

1567 [0] 1.8

1568 ],

1569 Male [

1570 [0] 18.2,

1571 [1] 20.5

1572 ]

1573 }

1574

1575 returns an empty array of hashes if neither target nor group keys are found.

1576

1577 =head3 Filtering

1578

1579 Data can be further broken down with filter/subs like in C:

1580

1581 my $testosterone = group_by($d, # group testosterone by "Gender"

1582 'Testosterone, total (nmol/L)',

1583 'Gender',

1584 { 'Race/Hispanic origin w/ NH Asian' => sub { $_ eq $n } },# filter

1585 { 'Testosterone, total (nmol/L)' => sub { $_ ne 'NA' } } # filter

1586 );

1587

1588 where each filter filters on the columns, e.g. second hash keys.

1589

1590 =head2 hoh2hoa

1591

1592 Convert a B (row-major: outer key = row, inner key = column)

1593 into a B (column-major: key = column, value = that column's

1594 cells down the rows).

1595

1596 use Stats::LikeR;

1597

1598 my %hoh = (

1599 'r1' => { 'a' => 1, 'b' => 2 },

1600 'r2' => { 'a' => 3, 'b' => 4 },

1601 );

1602

1603 my $hoa = hoh2hoa(\%hoh);

1604

1605 which returns

1606 {

1607 a => [1, 3],

1608 b => [2, 4],

1609 }

1610

1611 =head3 Behavior

1612

1613 =over

1614

1615 =item * B are the union of every inner key, so a key that appears in only

1616 some rows still becomes a column.

1617

1618 =item * B are emitted in sorted outer-key (row-name) order, and that one order

1619 is used for every column, so the arrays stay aligned and the result is

1620 reproducible regardless of hash ordering.

1621

1622 =item * B — a missing inner key, or a cell whose value is C — are filled

1623 with the fill value (see C below). Every column therefore has

1624 exactly one entry per row.

1625

1626 =item * Values are B into the result; the original structure is left

1627 untouched.

1628

1629 =item * An B hash of hashes returns an empty hash of arrays (it is not an

1630 error).

1631

1632 =back

1633

1634 =head3 Options

1635

1636 Options are passed as trailing C<< name =E value >> pairs.

1637

1638

1639

1640 =begin html

1641

1642

1643
1644
1645								Option
1646								Default
1647								Meaning
1648
1649
1650
1651
1652								`undef.val`
1653								`undef`
1654								Value used to fill a missing key or an `undef` cell. Any defined scalar works, including `0` and `''`. Passing `undef` keeps the default.
1655
1656
1657								`row.names`
1658								(none)
1659								If set to a string, an extra column of that name is added holding the sorted row labels, aligned with the data. Dies if the name collides with an existing column.
1660
1661
1662

1663

1664 =end html

1665

1666

1667

1668 # Ragged input with an explicit fill string:

1669 my %ragged = (

1670 'r1' => { 'a' => 1, 'b' => 2 },

1671 'r2' => { 'a' => 3, 'c' => 9 },

1672 );

1673 my $hoa = hoh2hoa(\%ragged, 'undef.val' => 'NA');

1674 # {

1675 # a => [1, 3 ],

1676 # b => [2, 'NA'],

1677 # c => ['NA', 9 ],

1678 # }

1679

1680 # Keep the row labels as a column:

1681 my $with_ids = hoh2hoa(\%ragged, 'row.names' => 'id');

1682 # {

1683 # id => ['r1', 'r2'],

1684 # a => [1, 3 ],

1685 # b => [2, undef],

1686 # c => [undef, 9 ],

1687 # }

1688

1689 =head3 Errors

1690

1691 C dies (via C) when:

1692

1693 =over

1694

1695 =item * the argument is not a hash reference,

1696

1697 =item * any value in the hash is not itself a hash reference,

1698

1699 =item * an unknown option is given, or the options are not C<< name =E value >> pairs,

1700

1701 =item * C is not a plain string, or it names an already-present column.

1702

1703 =back

1704

1705 =head2 hist

1706

1707 Computes the histogram of the given data values, operating in single $O(N)$ pass performance. It returns the bin counts, computed breaks, midpoints, and density.

1708

1709 my $res = hist([1, 2, 2, 3, 3, 3, 4, 4, 5], breaks => 4);

1710

1711 If C is not explicitly provided, it defaults to calculating the number of bins using Sturges' formula.

1712

1713 =head2 kruskal_test

1714

1715 Essentially the test determines if all groups have the same median (same distribution) (an excellent review is at https://library.virginia.edu/data/articles/getting-started-with-the-kruskal-wallis-test)

1716

1717 Performs a Kruskal-Wallis rank sum test, see

1718 https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/kruskal.test

1719

1720 =head3 hash of array entry

1721

1722 I feel that this is better, and more easily read, than what you get in R:

1723

1724 my %x = (

1725 'normal.subjects' => [2.9, 3.0, 2.5, 2.6, 3.2],

1726 'obs. airway disease' => [3.8, 2.7, 4.0, 2.4],

1727 'asbestosis' => [2.8, 3.4, 3.7, 2.2, 2.0]

1728 );

1729 $kt = kruskal_test(\%x);

1730

1731 =head3 R-like array entry

1732

1733 my @xk = (2.9, 3.0, 2.5, 2.6, 3.2); # normal subjects

1734 my @yk = (3.8, 2.7, 4.0, 2.4); # with obstructive airway disease

1735 my @zk = (2.8, 3.4, 3.7, 2.2, 2.0); # with asbestosis

1736 my @x = (@xk, @yk, @zk);

1737 my @g = (

1738 (map {'Normal subjects'} 0..4),

1739 (map {'Subjects with obstructive airway disease'} 0..3),

1740 map {'Subjects with asbestosis'} 0..4

1741 );

1742 my $kt = kruskal_test(\@x, \@g);

1743

1744 =head2 ks_test

1745

1746 The Kolmogorov-Smirnov test, which tests whether or not two arrays/lists of data are part of the same distribution is implemented simply:

1747

1748 $ks = ks_test(\@x, \@y, alternative => 'greater');

1749

1750 returning a hash reference.

1751

1752 Also, a single array can be tested against a normal distribution:

1753

1754 $ks = ks_test($ksx, 'pnorm');

1755

1756 The p-value precision is about 1e-8, which I want to improve, but am not sure how.

1757

1758 =head2 ljoin

1759

1760 Consider a hash: C<$h{$row}{$col}>, and another hash C<$i{$row}{$col}>.

1761 C will add information for C<$col> in C<%i> for each C<$row> to C<%h>, where C<$row> exists in both C<%h> and C<%i>

1762

1763 For example,

1764

1765 {

1766 "Jack Smith" {

1767 age 30

1768 }

1769 }

1770

1771 and a second hash,

1772 {

1773 "Jack Smith" {

1774 dept "Engineering"

1775 },

1776 "Jane Doe" {

1777 age 25

1778 }

1779 }

1780

1781 in this case, running C will modify \%h to result:

1782

1783 {

1784 "Jack Smith" {

1785 age 30,

1786 dept "Engineering"

1787 }

1788 }

1789

1790 =head2 lm

1791

1792 This is the linear models function.

1793

1794 $lm = lm(formula => 'mpg ~ wt + hp', data => $mtcars);

1795

1796 where C<$mtcars> is a hash of hashes

1797

1798 C also supports generating interaction terms directly within the formula using the C<*> operator:

1799

1800 my $lm = lm(formula => 'mpg ~ wt * hp^2', data => \%mtcars);

1801

1802 If your data contains missing numbers (C or C), C handles listwise deletion dynamically to ensure mathematical integrity before fitting.

1803

1804 the dot operator also works:

1805

1806 $lm = lm(formula => 'y ~ .', data => $dot_data);

1807

1808 =head2 matrix

1809

1810 my $mat1 = matrix(

1811 data => [1..6],

1812 nrow => 2

1813 );

1814

1815 You can also pass C<< byrow =E 1 >> if you want the matrix populated row-wise instead of column-wise.

1816

1817 As of version 0.10, parameters do not need to be named, so that C works more like R:

1818

1819 my $d = matrix(rnorm(32000), 1000, 32);

1820

1821 works as C, C, and C

1822

1823 =head2 max

1824

1825 max(1,2,3);

1826

1827 or

1828

1829 my @arr = 1..8;

1830 max(@arr, 4, 5)

1831

1832 as of version 0.02, max will die if any undefined values are provided

1833

1834 =head2 mean

1835

1836 mean(1,2,3);

1837

1838 or

1839

1840 my @arr = 1..8;

1841 mean(@arr, 4, 5)

1842

1843 or

1844

1845 mean([1,1], [2,2]) # 1.5

1846

1847 as of version 0.02, mean will die if any undefined values are provided

1848

1849 =head2 median

1850

1851 works like mean, taking array references and arrays:

1852

1853 median( $test_data[$i][0] )

1854

1855 as of version 0.02, median will die if any undefined values are provided

1856

1857 =head2 min

1858

1859 min(1,2,3);

1860

1861 or

1862

1863 my @arr = 1..8;

1864 min(@arr, 4, 5)

1865

1866 as of version 0.02, min will die if any undefined values are provided

1867

1868 =head2 mode

1869

1870 Takes either an array or an array reference, and returns an array of the most common scalars (numbers or strings)

1871

1872 @arr = mode([1,3,3,3]); # returns (3)

1873

1874 @arr = mode('a','a','c','c','z'); # returns ('a', 'c')

1875

1876 =head2 oneway_test

1877

1878 Like ANOVA/aov but does not assume normality

1879

1880 =head3 hash of array input

1881

1882 $test_data = oneway_test({

1883 yield => [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

1884 ctrl => [1, 1, 1, 0, 0, 0]

1885 });

1886

1887 which will output a hash reference:

1888

1889 {

1890 Group {

1891 Df 1,

1892 "F value" 177.504798464491,

1893 "Mean Sq" 61.6533333333333,

1894 Pr(>F) 1.31343255160843e-07,

1895 "Sum Sq" 61.6533333333333

1896 },

1897 group_stats {

1898 mean {

1899 ctrl 0.5,

1900 yield 5.03333333333333

1901 },

1902 size {

1903 ctrl 6,

1904 yield 6

1905 }

1906 },

1907 Residuals {

1908 Df 9.81767348326473,

1909 "Mean Sq" 0.353783749200256,

1910 "Sum Sq" 3.47333333333333

1911 }

1912

1913 }

1914

1915 =head3 array of array input

1916

1917 oneway_test([

1918 [5.5, 5.4, 5.8, 4.5, 4.8, 4.2],

1919 [1, 1, 1, 0, 0, 0]

1920 ]);

1921

1922 which will output a nearly identical hash reference as for hash of arrays:

1923

1924 {

1925 Group {

1926 Df 1,

1927 "F value" 177.504798464491,

1928 "Mean Sq" 61.6533333333333,

1929 Pr(>F) 1.31343255160843e-07,

1930 "Sum Sq" 61.6533333333333

1931 },

1932 group_stats {

1933 mean {

1934 "Index 0" 5.03333333333333,

1935 "Index 1" 0.5

1936 },

1937 size {

1938 "Index 0" 6,

1939 "Index 1" 6

1940 }

1941 },

1942 Residuals {

1943 Df 9.81767348326473,

1944 "Mean Sq" 0.353783749200256,

1945 "Sum Sq" 3.47333333333333

1946 }

1947 }

1948

1949 =head2 p_adjust

1950

1951 Returns array of false-discovery-rate-corrected p-values, where methods available are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr"

1952

1953 my @q = p_adjust(\@pvalues, $method);

1954

1955 =head2 power_t_test

1956

1957 $test_data = power_t_test(

1958 n => 30, delta => 0.5,

1959 sd => 1.0, sig_level => 0.05

1960 );

1961

1962 It also allows configuring the test type (C<< type =E 'one.sample' >>, C<'two.sample'>, C<'paired'>) and alternative hypothesis (C<< alternative =E 'one.sided' >>). You can also pass C<< strict =E 1 >> to strictly evaluate both tails of the distribution.

1963

1964

1965

1966 =begin html

1967

1968

1969
1970
1971								Parameter
1972								Type
1973								Default
1974								Description
1975
1976
1977
1978
1979								`n`
1980								Float
1981								`undef`
1982								Number of observations (per group for two-sample, pairs for paired).
1983
1984
1985								`delta`
1986								Float
1987								`undef`
1988								True difference in means.
1989
1990
1991								`sd`
1992								Float
1993								1.0
1994								Standard deviation.
1995
1996
1997								`sig_level`
1998								Float
1999								0.05
2000								Significance level (Type I error probability). Also accepts `sig.level`.
2001
2002
2003								`power`
2004								Float
2005								`undef`
2006								Power of test (1 minus Type II error probability).
2007
2008
2009								`type`
2010								String
2011								`"two.sample"`
2012								Type of t-test: `"two.sample"`, `"one.sample"`, or `"paired"`.
2013
2014
2015								`alternative`
2016								String
2017								`"two.sided"`
2018								One- or two-sided test: `"two.sided"`, `"one.sided"`, `"greater"`, or `"less"`.
2019
2020
2021								`strict`
2022								Boolean
2023								0 (False)
2024								Use strict interpretation of two-sided power calculations.
2025
2026
2027								`tol`
2028								Float
2029								~`1.22e-4`
2030								Numerical tolerance used for the internal root-finding algorithm.
2031
2032
2033

2034

2035 =end html

2036

2037

2038

2039 =head2 prcomp

2040

2041 Principal Component Analysis

2042

2043 =head3 Options

2044

2045

2046

2047 =begin html

2048

2049

2050
2051
2052								Option
2053								Type
2054								Default
2055								Description
2056
2057
2058
2059
2060								`center`
2061								Boolean
2062								`1` (True)
2063								If true, the variables are shifted to be zero-centered before the analysis takes place.
2064
2065
2066								`scale`
2067								Boolean
2068								`0` (False)
2069								If true, the variables are scaled to have unit variance before the analysis takes place. Note: If a column has zero variance, the function will `croak` to prevent division by zero.
2070
2071
2072								`retx`
2073								Boolean
2074								`1` (True)
2075								If true, the rotated data (the original data multiplied by the rotation matrix) is returned under the key `x`.
2076
2077
2078								`tol`
2079								Number
2080								`undef`
2081								A value indicating the magnitude below which components should be omitted. Components are omitted if their standard deviation is less than or equal to `tol` times the standard deviation of the first component.
2082
2083
2084								`rank`
2085								Integer
2086								`undef`
2087								Optionally specify a strict limit on the number of principal components to return. The function will return `min(rank, rows, columns)` components.
2088
2089
2090

2091

2092 =end html

2093

2094

2095

2096 =head3 Results

2097

2098 =head4 Returned Data Structure

2099

2100 The C function returns a HashRef containing the following keys representing the results of the Principal Component Analysis:

2101

2102

2103

2104 =begin html

2105

2106

2107
2108
2109								Key
2110								Type
2111								Description
2112
2113
2114
2115
2116								`sdev`
2117								ArrayRef[Number]
2118								The standard deviations of the principal components. Mathematically, these are the square roots of the eigenvalues of the covariance matrix.
2119
2120
2121								`rotation`
2122								ArrayRef[ArrayRef]
2123								A 2D array representing the matrix of variable loadings (the eigenvectors). Each inner array represents a row, and the columns correspond to the principal components.
2124
2125
2126								`x`
2127								ArrayRef[ArrayRef]
2128								A 2D array containing the rotated data (often referred to as PCA scores). This is the original data projected onto the principal components. Note: Only present if the `retx` option is true.
2129
2130
2131								`center`
2132								ArrayRef[Number] or `0`
2133								The centering values used (typically the column means). Returns false (`0`) if centering was disabled.
2134
2135
2136								`scale`
2137								ArrayRef[Number] or `0`
2138								The scaling values used (typically the column standard deviations). Returns false (`0`) if scaling was disabled.
2139
2140
2141								`varnames`
2142								ArrayRef[String]
2143								The sorted names of the original variables. Note: Only present if the input data was a Hash of Arrays (HoA) or a Hash of Hashes (HoH).
2144
2145
2146

2147

2148 =end html

2149

2150

2151

2152 =head3 Using array of arrays

2153

2154 my $aoa = [

2155 [2, 4],

2156 [4, 2],

2157 [6, 6]

2158 ];

2159

2160 my $pca = prcomp($aoa);

2161

2162 which returns

2163

2164 {

2165 center [

2166 [0] 4,

2167 [1] 4

2168 ],

2169 rotation [

2170 [0] [

2171 [0] 0.707106781186547,

2172 [1] 0.707106781186548

2173 ],

2174 [1] [

2175 [0] 0.707106781186548,

2176 [1] -0.707106781186547

2177 ]

2178 ],

2179 scale 0,

2180 sdev [

2181 [0] 2.44948974278318,

2182 [1] 1.4142135623731

2183 ],

2184 x [

2185 [0] [

2186 [0] -1.41421356237309,

2187 [1] -1.4142135623731

2188 ],

2189 [1] [

2190 [0] -1.4142135623731,

2191 [1] 1.41421356237309

2192 ],

2193 [2] [

2194 [0] 2.82842712474619,

2195 [1] 2.22044604925031e-16

2196 ]

2197 ]

2198 }

2199

2200 =head3 Hash of Arrays

2201

2202 my $hoa = { B => [4, 2, 6], A => [2, 4, 6] };

2203 my $pca = prcomp($hoa);

2204

2205 =head2 quantile

2206

2207 Calculates sample quantiles using R's continuous Type 7 interpolation.

2208

2209 my $quantile = quantile('x' => [1..99], probs => [0.05, 0.1, 0.25]);

2210

2211 If the C parameter is omitted, it behaves identically to R by defaulting to the 0, 25, 50, 75, and 100 percentiles (C). The returned hash keys match R's standardized naming convention (e.g., C<"25%">, C<"33.3%">).

2212

2213 =head2 rbinom

2214

2215 Create a binomial distribution of numbers

2216

2217 my $binom = rbinom( n => $n, prob => 0.5, size => 9);

2218

2219 It hooks directly into Perl's internal PRNG system, respecting C seeds.

2220

2221 =head2 read_table

2222

2223 I've tried to make this as simple as possible, trying to follow from R:

2224

2225 my $test_data = read_table('t/HepatitisCdata.csv');

2226

2227 =head3 options

2228

2229

2230

2231 =begin html

2232

2233

2234
2235
2236								Option
2237								Description
2238								Example
2239
2240
2241
2242
2243								`comment`
2244								Comment character, by default `#`
2245								`comment => %` or whatever; does not apply with header
2246
2247
2248								`output.type`
2249								data type for output: array of hash, hash of array, or hash of hash
2250								`'output.type' => 'aoh'`
2251
2252
2253								`filter`
2254								Only take in rows with a certain filter
2255								`filter => { Sex => sub {$_ eq 'f'} }`
2256
2257
2258								`row.names`
2259								include row names in retrieved data; off by default
2260
2261
2262
2263								`sep`
2264								field separator character; synonym with `delim`
2265								`sep => "\t"`
2266
2267
2268								`delim`
2269								field separator character; synonym with `sep`
2270								`delim => "\t"`
2271
2272
2273

2274

2275 =end html

2276

2277

2278

2279 output types can be AOH (aoa), HOA (hoa), HOH (hoh)

2280

2281 read_table($filename, 'output.type' => 'aoh');

2282

2283 read_table($filename, 'output.type' => 'hoa');

2284

2285 and, like Text::CSV_XS, filters can be applied in order to save RAM on big files:

2286

2287 $test_data = read_table(

2288 't/HepatitisCdata.csv',

2289 filter => {

2290 Sex => sub {$_ eq 'f'} # where "Sex" is the column name, and "$_" is the value for that column

2291 },

2292 'output.type' => 'aoh'

2293 );

2294

2295 the default delimiter is C<,>

2296 Suffixes C<.csv> and C<.tsv> are automatically detected from file names, but if specified, are overridden by C and/or C. C is given priority.

2297

2298 =head2 rnorm

2299

2300 Make a normal distribution of numbers, with pre-set mean C, standard deviation C, and number C.

2301

2302 my ($rmean, $sd, $n) = (10, 2, 9999);

2303 my $normals = rnorm( n => $n, mean => $rmean, sd => $sd);

2304

2305 =head2 runif

2306

2307 Make an approximately uniform distribution into an array

2308

2309 =head3 named arguments

2310

2311 my $unif = runif( n => $n, min => 0, max => 1);

2312

2313 where C is the number of items, the values are between C and C

2314

2315 =head3 positional args

2316

2317 this is to match R's behavior:

2318

2319 runif( 9 )

2320

2321 will make 9 numbers in [0,1]

2322

2323 runif(9, 0, 99)

2324

2325 will match C, C, and C respectively

2326

2327 =head2 sample

2328

2329 take a sample of hash or array slices.

2330

2331 my $h = sample(\%h, 4); # take 4 hash keys and their values into $h

2332

2333 or, alternatively, with arrays:

2334

2335 my $arr = sample(\@arr, 3); # take 3 indices of an array

2336

2337 =head2 scale

2338

2339 my @scaled_results = scale(1..5);

2340

2341 You can also pass an options hash to disable centering or scaling:

2342

2343 my @scaled_results = scale(1..5, { center => false, scale => 1 });

2344

2345 It fully supports matrix operations. By passing an array of arrays, C processes the data column by column independently:

2346

2347 my $scaled_mat = scale([[1, 2], [3, 4], [5, 6]]);

2348

2349 =head2 sd

2350

2351 my $stdev = sd(2,4,4,4,5,5,7,9);

2352

2353 Correct answer is 2.1380899352994

2354

2355 C can accept both array references as well as arrays:

2356

2357 my $stdev = sd([2,4,4,4,5,5,7,9]);

2358

2359 As of version 0.02, sd will croak/die if any undefined values are provided.

2360

2361 =head2 seq

2362

2363 Works as closely as I can to R's seq, which is very similar to Perl's C loops. Returns an array, not an array reference.

2364

2365 =head3 Standard integer sequence

2366

2367 say 'seq(1, 5):';

2368 my @seq = seq(1, 5);

2369 say join(', ', @seq), "\n";

2370

2371 say 'seq(1, 2, 0.25):';

2372 @seq = seq(1, 2, 0.25);

2373

2374 =head3 Fractional steps

2375

2376 say 'seq(1, 2, 0.25):';

2377 @seq = seq(1, 2, 0.25);

2378 say join(", ", @seq), "\n";

2379 for (my $idx = 2; $idx >= 1; $idx -= 0.25) { # count down to pop

2380 is_approx(pop @seq, $idx, "seq item $idx with fractional step");

2381 }

2382

2383 =head3 Negative steps

2384

2385 say 'seq(10, 5, -1):';

2386 @seq = seq(10, 5, -1);

2387 say join(", ", @seq), "\n";

2388 for (my $idx = 5; $idx <= 10; $idx++) { # count down to pop

2389 is_approx(pop @seq, $idx, "seq item $idx with negative step");

2390 }

2391

2392 =head2 shapiro_test

2393

2394 tests to see if an array reference is normally distributed, returns a p-value and a statistic

2395

2396 my $shapiro = shapiro_test(

2397 [1..5]

2398 );

2399

2400 and returns the hash reference:

2401

2402 {

2403 p.value 0.589650577093106,

2404 p_value 0.589650577093106,

2405 statistic 0.960870680168535,

2406 W 0.960870680168535

2407 }

2408

2409 =head2 sum

2410

2411 returns sum, but using both arrays and array references.

2412

2413 my $test_data = [1..8];

2414 sum($test_data)

2415

2416 which I prefer, compared to List::Util's required casting into an array:

2417

2418 sum(@{ $test_data });

2419

2420 which passing a reference is shorter and much easier to read. Stats::LikeR, however, will work for B

2421

2422 as of version 0.02, C will cause the script to die if any undefined values are provided

2423

2424 =head2 summary

2425

2426 Analogous to R's C
, but does not deal with outputs from other functions.

2427 C
only describes data as it is entered.

2428 An option C or its synonym C specifies the maximum number of rows that will print.

2429

2430 =head3 array of array input

2431

2432 my @arr;

2433 foreach my $i (0..18) {

2434 push @arr, runif(22);

2435 }

2436

2437 and then C, or C

2438

2439 ---------------------------------------------------------------------------

2440 Index # values Min. 1st Qu. Median Mean 3rd Qu. Max.

2441 ---------------------------------------------------------------------------

2442 0 22 0.04312 0.286 0.4975 0.5121 0.7296 0.9633

2443 1 22 0.05932 0.1483 0.495 0.4737 0.7699 0.9371

2444 2 22 0.02742 0.1588 0.4045 0.4325 0.6682 0.9878

2445 3 22 0.009233 0.2552 0.5398 0.5147 0.7755 0.9808

2446 4 22 0.06727 0.2432 0.5019 0.4855 0.7121 0.9043

2447 5 22 0.001032 0.1646 0.3021 0.3727 0.5704 0.9556

2448

2449 =head3 hash of array input

2450

2451 $test_data = summary(

2452 {

2453 A => runif(9),

2454 B => runif(9)

2455 },

2456 );

2457

2458 =head2 t_test

2459

2460 There are 1-sample and 2-sample t-tests, from one or two arrays:

2461

2462 my $t_test = t_test( $array1, mu => 0.2334 );

2463

2464 or 2-sample:

2465

2466 $t_test = t_test(

2467 $array1, $array2,

2468 paired => 1

2469 );

2470

2471 returns a hash reference, which looks like:

2472

2473 conf_int => [

2474 -0.06672889, 0.25672889

2475 ],

2476 df => 5,

2477 estimate => 0.095,

2478 p_value => 0.19143688433660,

2479 statistic => 1.50996688705414

2480

2481 the two groups compared can be specified, though not necessarily, as C and C, just like in R:

2482

2483 $t_test = t_test(

2484 'x' => $array1, 'y' => $array2,

2485 paired => 1

2486 );

2487

2488 =head3 Parameters

2489

2490

2491

2492 =begin html

2493

2494

2495
2496
2497								Parameter
2498								Type
2499								Default
2500								Description
2501
2502
2503
2504
2505								`x`
2506								Array Reference
2507								Required
2508								The first vector of data. Must contain at least 2 elements.
2509
2510
2511								`y`
2512								Array Reference
2513								`undef`
2514								The second vector of data. Required for two-sample or paired tests.
2515
2516
2517								`mu`
2518								Float
2519								0.0
2520								The true value of the mean (or difference in means) for the null hypothesis.
2521
2522
2523								`paired`
2524								Boolean
2525								`FALSE`
2526								If true, performs a paired t-test. `x` and `y` must be the same length.
2527
2528
2529								`var_equal`
2530								Boolean
2531								`FALSE`
2532								If true, assumes equal variances (standard two-sample). If false, performs Welch's t-test with unequal variances.
2533
2534
2535								`conf_level`
2536								Float
2537								0.95
2538								Confidence level for the returned confidence interval. Must be between 0 and 1.
2539
2540
2541								`alternative`
2542								String
2543								`"two.sided"`
2544								Direction of the alternative hypothesis: `"two.sided"`, `"less"`, or `"greater"`.
2545
2546
2547

2548

2549 =end html

2550

2551

2552

2553 =head3 Return Hash

2554

2555

2556

2557 =begin html

2558

2559

2560
2561
2562								Key
2563								Description
2564
2565
2566
2567
2568								`statistic`
2569								The computed t-statistic.
2570
2571
2572								`df`
2573								Degrees of freedom for the test.
2574
2575
2576								`p_value`
2577								The calculated p-value based on the test directionality.
2578
2579
2580								`conf_int`
2581								An Array Reference containing two elements: `[lower_bound, upper_bound]`.
2582
2583
2584								`estimate`
2585								The estimated mean of `x` (one-sample) OR the mean of the differences (paired).
2586
2587
2588								`estimate_x`
2589								The estimated mean of the `x` vector (only returned in two-sample tests).
2590
2591
2592								`estimate_y`
2593								The estimated mean of the `y` vector (only returned in two-sample tests).
2594
2595
2596

2597

2598 =end html

2599

2600

2601

2602 =head2 transpose

2603

2604 Transposes a two-dimensional data structure, swapping rows and columns. Accepts either an array of arrays or a hash of hashes.

2605 Returns a new reference of the same type; the input is never modified.

2606

2607 =head3 Array of array input

2608

2609 Takes a reference to an array of array references and returns a new AoA where C.

2610

2611 my $matrix = [[1, 2, 3], [4, 5, 6]];

2612 my $t = transpose($matrix);

2613 # [[1, 4],

2614 # [2, 5],

2615 # [3, 6]]

2616

2617 All rows must be the same length; a ragged input is a fatal error.

2618 C is valid as an element value and is preserved exactly. An empty outer array or an array of empty rows both return C<[]>.

2619

2620 Dies if:

2621 - any inner element is not an array reference

2622 - rows differ in length (ragged array)

2623

2624 =head3 Hash of hash input

2625

2626 Takes a reference to a hash of hash references and returns a new HoH where C.

2627

2628 my $table = { alice => { score => 97, grade => 'A' }, bob => { score => 84, grade => 'B' } };

2629 my $t = transpose($table);

2630 # { score => { alice => 97, bob => 84 },

2631 # grade => { alice => 'A', bob => 'B' } }

2632

2633 Inner keys do not need to be uniform across rows. If a given column key appears in only some rows, the output hash for that column will simply contain only those rows — no padding or C-filling is performed.

2634

2635 my $sparse = {

2636 a => { x => 1, y => 2 },

2637 b => { x => 3, z => 4 } };

2638

2639 my $t = transpose($sparse);

2640 # { x => { a => 1, b => 3 },

2641 # y => { a => 2 },

2642 # z => { b => 4 } }

2643

2644 An empty outer hash or an outer hash whose inner hashes are all empty both return C<{}>.

2645

2646 Dies if any inner element is not a hash reference

2647

2648 =head2 value_counts

2649

2650 Count the values in a given data set, return a hash reference showing how many times each particular value is present.

2651

2652 =head3 Scalar

2653

2654 $hash = value_counts('c');

2655

2656 returns C<< { c =E 1 } >>

2657

2658 =head3 Array reference

2659

2660 value_counts(['a','b','b']);

2661

2662 returns C<< { a =E 1, b =E 2} >>

2663

2664 =head3 Array

2665

2666 my $value_counts = value_counts('a','b','b');

2667

2668 like an array reference above, returns C<< { a =E 1, b =E 2} >>

2669

2670 =head3 Hash

2671

2672 my $value_counts = value_counts( { A => 'a', B => 'a', C => 'b' } );

2673

2674 returns C<< { a =E 2, b =E 1} >>

2675

2676 =head3 Hash of array

2677

2678 my $value_counts = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']});

2679

2680 without a key (like above), the occurences of C, C, and C are counted.

2681

2682 With a key, like C for above, only values within that hash key are counted:

2683

2684 my $vc = value_counts({ 'a' => ['j', 't', 't'], 'b' => ['j', 't', 'v']}, 'a');

2685

2686 =head3 Hash of hash (table)

2687

2688 $hash = value_counts( {

2689 A => {

2690 a => 'x',

2691 b => 'z'

2692 },

2693 B => {

2694 a => 'x'

2695 },

2696 C => {

2697 a => 'y'

2698 }

2699 }, 'a');

2700

2701 the column, or second hash key, that you wish to count, is specified at the command line

2702

2703 =head2 var

2704

2705 as simple as possible:

2706

2707 var(2, 4, 5, 8, 9)

2708

2709 as of version 0.02, C will die if any undefined values are provided

2710

2711 like C, C, etc., C can accept array references, to make code simpler:

2712

2713 my $ref = \@arr;

2714 var($ref) = var(@arr)

2715

2716 =head2 var_test

2717

2718 As described by R: Performs an F test to compare the variances of two samples from normal populations

2719

2720 use Stats::LikeR;

2721

2722 my @x = (2.9, 3.0, 2.5, 2.6, 3.2);

2723 my @y = (3.8, 2.7, 4.0, 2.4);

2724

2725 my $vt = var_test(\@x, \@y);

2726

2727 also, conf_level can be set:

2728

2729 $vt = var_test(\@x, \@y, conf_level => 0.99);

2730

2731 as well as a ratio (from R: the hypothesized ratio of the population variances of C and C:

2732

2733 $test_data = var_test(\@xk, \@yk, ratio => 2);

2734

2735 =head2 wilcox_test

2736

2737 $test_data = wilcox_test(

2738 [1.83, 0.50, 1.62, 2.48, 1.68, 1.88, 1.55, 3.06, 1.30],

2739 [0.878, 0.647, 0.598, 2.05, 1.06, 1.29, 1.06, 3.14, 1.29]

2740 );

2741

2742 It fully supports paired tests (C<< paired =E 1 >>) and can calculate exact p-values (the default for C<< N E 50 >> without ties). If ties are encountered, it automatically switches to an approximation with continuity correction.

2743

2744 =head2 write_table

2745

2746 mimics R's C, with data as first argument to subroutine, and output file as second

2747

2748 write_table(\@data_aoh, $tmp_file, sep => "\t", 'row.names' => 1);

2749

2750 You can also precisely filter and reorder which columns are written by passing an array reference to C:

2751

2752 write_table(\@data, $tmp_file, sep => "\t", 'col.names' => ['c', 'a']);

2753

2754 undefined variables are printed as C by default, but can be set as you wish using C

2755

2756 write_table(\%data_hoa, '/tmp/undef.val.tsv', sep => "\t", 'undef.val' => 'nan')

2757

2758 as of version 0.07, C determines comma and tab-separated delimiters from the filename, but will override if C or C are explicitly set.

2759

2760 Args can also be accepted:

2761

2762 write_table( 'data' => \%flat, 'file' => $f );

2763

2764 =head1 changes

2765

2766 =head2 0.14

2767

2768 C function added for rows

2769

2770 C reads undefined values to C instead of C, which makes calculations easier

2771

2772 C writes undef by default as an empty string C<''>

2773

2774 C transforms a hash of hashes into an hash of arrays

2775

2776 C uses C instead of C to allow for high-precision 128-bit floats to be used on quadmath machines when available: https://www.cpantesters.org/cpan/report/296f4868-631f-11f1-abba-ff15558d240b

2777

2778 Numerous switches from C to C for local precision, like above

2779

2780 numerous changes to C for ease of use and working with datasets with numerous undefined values

2781

2782 dist.ini now links to math library when compiling: https://www.cpantesters.org/cpan/report/785e26d8-6397-11f1-89c0-dc066e8775ea

2783

2784 C now should be complete, errors with confidence intervals fixed

2785

2786 =head2 0.13

2787

2788 C: speed improvements; commented headers are now allowed

2789

2790 C: fix for

2791

2792 Attempt to free temp prematurely: SV 0x56417a2ae610 at t/write_table.t line 182.

2793 main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203

2794 Attempt to free unreferenced scalar: SV 0x56417a2ae610 at t/write_table.t line 183.

2795 main::wrote_ok(",age\x{a}Alice,30\x{a}Bob,25\x{a}", "row.names => 'name' uses that column as labels", HASH(0x56417a272250), "row.names", "name") called at t/write_table.t line 203

2796

2797 C gives better warnings for incorrect types of data given

2798

2799 Numerous changes to dist.ini to improve CPAN testing, especially for Win32

2800

2801 =head2 0.12

2802

2803 C can also take hash of arrays, and various mixes of data types

2804

2805 C: Addition of C keywords in many places; should improve CPU performance

2806

2807 Better POD formatting, correction of output hash for README's C

2808

2809 C can now accept hash of hashes as input

2810

2811 new C function for switching 2D hash keys and 2D array indices, and C for comparing columns against columns

2812

2813 removed unused function from C helpers

2814

2815 C: addition of restrict keywords in preinit, should improve CPU performance

2816

2817 MANIFEST.skip changed to MANIFEST.SKIP to improve CPAN testing

2818

2819 using C for tests of C, which may or may not work with CPAN testers (experimental)

2820

2821 Added function name to warnings, so I actually know which function is producing the error

2822

2823 C can also take C and C as args, in addition to positions

2824

2825 fixed C as it could hang if given empty C or C

2826

2827 Added C<__EXTENSIONS__> to source XS file for better CPAN testing

2828

2829 =head2 0.11

2830

2831 better POD formatting for tables

2832

2833 addition of MANIFEST.skip to get better testing results on CPAN

2834

2835 C: bugfix for when there is no intercept in the formula, new test cases in t/glm.t

2836

2837 C now accepts simple hashes as input, in addition to hash of arrays, hash of hashes, and arrays of hashes

2838

2839 Better documentation for t-test

2840

2841 =head2 0.10

2842

2843 changes to compilation for CPAN, trying to get this work on Windows

2844

2845 Addition of C and C

2846

2847 C will work without key names, just like in R. Testing for C has improved.

2848

2849 =head2 0.09

2850

2851 context changes in XS C, C, and C to get better CPAN testing results

2852

2853 C keywords added to C to increase speed

2854

2855 =head2 0.08

2856

2857 Speed improvement in C
of hashes.

2858

2859 Addition of C, C, C, C, and C functions

2860

2861 Chi-squared function no longer has Perl wrapper, and all code is in XS, which should result in a minor speed increase with 1 less function call.

2862

2863 Compiler changes for GNU source and inclusion of C, to ensure more CPAN testing works better.

2864

2865 C now returns hash-of-hash in {row}{column}

2866

2867 =head2 0.07

2868

2869 Addition of C
function.

2870

2871 Formulas can now be omitted from C, resulting in a stacked calculation as R would think.

2872

2873 Addition of C for multi-group comparisons that does not assume normality like C does.

2874

2875 C and C now automatically set separators for C<.csv> files as C<,> and C<.tsv> files as C<"\t">, respectively, so these values no longer need to be specified separately from the file name.