File Coverage

lib/PDL/Stats/Basic.pd

Criterion	Covered	Total	%
statement	121	136	88.9
branch	33	56	58.9
condition	10	18	55.5
subroutine	10	11	90.9
pod	2	5	40.0
total	176	226	77.8

line	stmt	bran	cond	sub	pod	time	code
1							use strict;
2							use warnings;
3
4							my $F = [qw(F D)];
5
6							pp_add_exported(qw(binomial_test rtable which_id code_ivs
7							));
8
9							pp_addpm({At=>'Top'}, <<'EOD');
10	4			4		65
	4					16
	4					198
11	4			4		65	use strict;
	4					6
	4					279
12	4			4		1100	use warnings;
	4					2425
	4					30
13	4			4		152371	use PDL::LiteF;
	4					8
	4					13058
14							use Carp;
15
16							eval { require PDL::Core; require PDL::GSL::CDF; };
17							my $CDF = 1 if !$@;
18
19							=head1 NAME
20
21							PDL::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests.
22
23							=head1 DESCRIPTION
24
25							The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are NOT broadcastable, respectively.
26
27							Does not have mean or median function here. see SEE ALSO.
28
29							=head1 SYNOPSIS
30
31							use PDL::LiteF;
32							use PDL::Stats::Basic;
33
34							my $stdv = $data->stdv;
35
36							or
37
38							my $stdv = stdv( $data );
39
40							=cut
41
42							EOD
43
44							pp_addhdr('
45							#include
46							'
47							);
48
49							pp_def('stdv',
50							Pars => 'a(n); [o]b()',
51							GenericTypes => $F,
52							HandleBad => 1,
53							Code => '
54							$GENERIC(b) sa = 0, a2 = 0;
55							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
56							loop (n) %{
57							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
58							sa += $a();
59							a2 += $a() * $a();
60							%}
61							if (N < 1) { $SETBAD(b()); continue; }
62							$GENERIC() var = a2 / N - (sa/N)*(sa/N);
63							if (var < 0) var = 0;
64							$b() = sqrt(var);
65							',
66							Doc => 'Sample standard deviation.',
67							);
68
69							pp_def('stdv_unbiased',
70							Pars => 'a(n); [o]b()',
71							GenericTypes => $F,
72							HandleBad => 1,
73							Code => '
74							$GENERIC(b) sa = 0, a2 = 0;
75							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
76							loop (n) %{
77							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
78							sa += $a();
79							a2 += $a() * $a();
80							%}
81							if (N < 2) { $SETBAD(b()); continue; }
82							$GENERIC() var = a2/(N-1) - sasa/(N(N-1));
83							if (var < 0) var = 0;
84							$b() = sqrt(var);
85							',
86							Doc => 'Unbiased estimate of population standard deviation.',
87							);
88
89							pp_def('var',
90							Pars => 'a(n); [o]b()',
91							GenericTypes => $F,
92							HandleBad => 1,
93							Code => '
94							$GENERIC(b) a2 = 0, sa = 0;
95							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
96							loop (n) %{
97							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
98							sa += $a();
99							a2 += $a() * $a();
100							%}
101							if (N < 1) { $SETBAD(b()); continue; }
102							$b() = a2 / N - sasa/(NN);
103							',
104							Doc => 'Sample variance.',
105							);
106
107							pp_def('var_unbiased',
108							Pars => 'a(n); [o]b()',
109							GenericTypes => $F,
110							HandleBad => 1,
111							Code => '
112							$GENERIC(b) a2 = 0, sa = 0;
113							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
114							loop (n) %{
115							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
116							a2 += $a() * $a();
117							sa += $a();
118							%}
119							if (N < 2) { $SETBAD(b()); continue; }
120							$b() = (a2 - sa*sa/N) / (N-1);
121							',
122							Doc => 'Unbiased estimate of population variance.',
123							);
124
125							pp_def('se',
126							Pars => 'a(n); [o]b()',
127							GenericTypes => $F,
128							HandleBad => 1,
129							Code => '
130							$GENERIC(b) sa = 0, a2 = 0;
131							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
132							loop (n) %{
133							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
134							sa += $a();
135							a2 += $a() * $a();
136							%}
137							$GENERIC() se2 = (a2 - sasa/N) / (N(N-1));
138							if (se2 < 0) se2 = 0;
139							$b() = sqrt(se2);
140							',
141							Doc => '
142							=for ref
143
144							Standard error of the mean. Useful for calculating confidence intervals.
145
146							=for example
147
148							# 95% confidence interval for samples with large N
149							$ci_95_upper = $data->average + 1.96 * $data->se;
150							$ci_95_lower = $data->average - 1.96 * $data->se;
151							',
152							);
153
154							pp_def('ss',
155							Pars => 'a(n); [o]b()',
156							GenericTypes => $F,
157							HandleBad => 1,
158							Code => '
159							$GENERIC(b) sa = 0, a2 = 0;
160							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
161							loop (n) %{
162							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
163							sa += $a();
164							a2 += $a() * $a();
165							%}
166							if (N < 1) { $SETBAD(b()); continue; }
167							$b() = a2 - sa*sa/N;
168							',
169							Doc => 'Sum of squared deviations from the mean.',
170							);
171
172							pp_def('skew',
173							Pars => 'a(n); [o]b()',
174							GenericTypes => $F,
175							HandleBad => 1,
176							Code => '
177							$GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d3 = 0;
178							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
179							loop (n) %{
180							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
181							sa += $a();
182							%}
183							if (N < 1) { $SETBAD(b()); continue; }
184							m = sa / N;
185							loop (n) %{
186							if ( $ISGOOD($a()) ) {
187							d = $a() - m;
188							d2 += d*d;
189							d3 += ddd;
190							}
191							%}
192							$b() = d3/N / pow(d2/N, 1.5);
193							',
194							Doc => 'Sample skewness, measure of asymmetry in data. skewness == 0 for normal distribution.',
195							);
196
197							pp_def('skew_unbiased',
198							Pars => 'a(n); [o]b()',
199							GenericTypes => $F,
200							HandleBad => 1,
201							Code => '
202							$GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d3 = 0;
203							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
204							loop (n) %{
205							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
206							sa += $a();
207							%}
208							if (N < 3) { $SETBAD(b()); continue; }
209							m = sa / N;
210							loop (n) %{
211							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
212							d = $a() - m;
213							d2 += d*d;
214							d3 += ddd;
215							%}
216							$b() = sqrt(N(N-1)) / (N-2) d3/N / pow(d2/N, 1.5);
217							',
218							Doc => 'Unbiased estimate of population skewness. This is the number in GNumeric Descriptive Statistics.',
219							);
220
221							pp_def('kurt',
222							Pars => 'a(n); [o]b()',
223							GenericTypes => $F,
224							HandleBad => 1,
225							Code => '
226							$GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d4 = 0;
227							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
228							loop (n) %{
229							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
230							sa += $a();
231							%}
232							if (N < 1) { $SETBAD(b()); continue; }
233							m = sa / N;
234							loop (n) %{
235							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
236							d = $a() - m;
237							d2 += d*d;
238							d4 += ddd*d;
239							%}
240							$b() = N * d4 / (d2*d2) - 3;
241							',
242							Doc => 'Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution.',
243							);
244
245							pp_def('kurt_unbiased',
246							Pars => 'a(n); [o]b()',
247							GenericTypes => $F,
248							HandleBad => 1,
249							Code => '
250							$GENERIC(b) sa = 0, m = 0, d=0, d2 = 0, d4 = 0;
251							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
252							loop (n) %{
253							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
254							sa += $a();
255							%}
256							if (N < 4) { $SETBAD(b()); continue; }
257							m = sa / N;
258							loop (n) %{
259							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
260							d = $a() - m;
261							d2 += d*d;
262							d4 += ddd*d;
263							%}
264							$b() = ((N-1)N(N+1) * d4 / (d2d2) - 3 (N-1)(N-1)) / ((N-2)(N-3));
265							',
266							Doc => 'Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics.',
267							);
268
269							pp_def('cov',
270							Pars => 'a(n); b(n); [o]c()',
271							GenericTypes => $F,
272							HandleBad => 1,
273							Code => '
274							$GENERIC(c) ab = 0, sa = 0, sb = 0;
275							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
276							loop (n) %{
277							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) continue; N++;,)
278							ab += $a() * $b();
279							sa += $a();
280							sb += $b();
281							%}
282							if (N < 1) { $SETBAD(c()); continue; }
283							$c() = ab / N - (sa/N) * (sb/N);
284							',
285							Doc => 'Sample covariance. see B for ways to call',
286							);
287
288							pp_def('cov_table',
289							Pars => 'a(n,m); [o]c(m,m)',
290							HandleBad => 1,
291							RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");',
292							Code => '
293							$GENERIC(a) a_, b_;
294							PDL_Indx M = $SIZE(m), i, j;
295							for (i=0; i
296							for (j=i; j
297							$GENERIC(c) ab = 0, sa = 0, sb = 0;
298							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
299							loop (n) %{
300							PDL_IF_BAD(if ($ISBAD($a(m=>i)) \|\| $ISBAD($a(m=>j))) continue; N++;,)
301							sa += a_ = $a(m=>i);
302							sb += b_ = $a(m=>j);
303							ab += a_ * b_;
304							%}
305							if (N < 2) {
306							$SETBAD($c(m0=>i, m1=>j));
307							$SETBAD($c(m0=>j, m1=>i));
308							continue;
309							}
310							$GENERIC(c) cov = ab - (sa * sb) / N;
311							$c(m0=>i, m1=>j) =
312							$c(m0=>j, m1=>i) = cov / N;
313							}
314							}
315							',
316							Doc => '
317							=for ref
318
319							Square covariance table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls.
320
321							=for example
322
323							Usage:
324
325							# 5 obs x 3 var, 2 such data tables
326
327							pdl> $a = random 5, 3, 2
328
329							pdl> p $cov = $a->cov_table
330							[
331							[
332							[ 8.9636438 -1.8624472 -1.2416588]
333							[-1.8624472 14.341514 -1.4245366]
334							[-1.2416588 -1.4245366 9.8690655]
335							]
336							[
337							[ 10.32644 -0.31311789 -0.95643674]
338							[-0.31311789 15.051779 -7.2759577]
339							[-0.95643674 -7.2759577 5.4465141]
340							]
341							]
342							# diagonal elements of the cov table are the variances
343							pdl> p $a->var
344							[
345							[ 8.9636438 14.341514 9.8690655]
346							[ 10.32644 15.051779 5.4465141]
347							]
348
349							for the same cov matrix table using B,
350
351							pdl> p $a->dummy(2)->cov($a->dummy(1))
352							',
353
354							);
355
356							pp_def('corr',
357							Pars => 'a(n); b(n); [o]c()',
358							GenericTypes => $F,
359							HandleBad => 1,
360							RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");',
361							Code => '
362							$GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb;
363							ab=0; sa=0; sb=0; a2=0; b2=0;
364							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
365							loop (n) %{
366							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) continue; N++;,)
367							ab += $a() * $b();
368							sa += $a();
369							sb += $b();
370							a2 += $a() * $a();
371							b2 += $b() * $b();
372							%}
373							if (N < 2) { $SETBAD(c()); continue; }
374							cov = ab - (sa * sb) / N;
375							va = a2 - sa*sa / N;
376							vb = b2 - sb*sb / N;
377							$c() = cov / sqrt( va * vb );
378							',
379							Doc => '
380							=for ref
381
382							Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)).
383
384							=for example
385
386							Usage:
387
388							pdl> $a = random 5, 3
389							pdl> $b = sequence 5,3
390							pdl> p $a->corr($b)
391
392							[0.20934208 0.30949881 0.26713007]
393
394							for square corr table
395
396							pdl> p $a->corr($a->dummy(1))
397
398							[
399							[ 1 -0.41995259 -0.029301192]
400							[ -0.41995259 1 -0.61927619]
401							[-0.029301192 -0.61927619 1]
402							]
403
404							but it is easier and faster to use B.
405							',
406							);
407
408							pp_def('corr_table',
409							Pars => 'a(n,m); [o]c(m,m)',
410							HandleBad => 1,
411							RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");',
412							Code => '
413							$GENERIC(a) a_, b_;
414							$GENERIC(c) ab, sa, sb, a2, b2, cov, va, vb, r;
415							PDL_Indx M = $SIZE(m), i, j;
416							for (i=0; i
417							for (j=i+1; j
418							ab=0; sa=0; sb=0; a2=0; b2=0;
419							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
420							loop (n) %{
421							PDL_IF_BAD(if ($ISBAD($a(m=>i)) \|\| $ISBAD($a(m=>j))) continue; N++;,)
422							sa += a_ = $a(m=>i);
423							sb += b_ = $a(m=>j);
424							ab += a_ * b_;
425							a2 += a_ * a_;
426							b2 += b_ * b_;
427							%}
428							if (N < 2) {
429							$SETBAD($c(m0=>i, m1=>j));
430							$SETBAD($c(m0=>j, m1=>i));
431							continue;
432							}
433							cov = ab - (sa * sb) / N;
434							va = a2 - sa*sa / N;
435							vb = b2 - sb*sb / N;
436							r = cov / sqrt( va * vb );
437							$c(m0=>i, m1=>j) =
438							$c(m0=>j, m1=>i) = r;
439							}
440							PDL_IF_BAD(PDL_Indx N = 0;
441							loop (n) %{
442							if ($ISGOOD($a(m=>i)))
443							N ++;
444							if (N > 1)
445							break;
446							%}
447							if (N < 2) { $SETBAD($c(m0=>i, m1=>i)); continue; },)
448							$c(m0=>i, m1=>i) = 1.0;
449							}
450							',
451							Doc => '
452							=for ref
453
454							Square Pearson correlation table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls.
455
456							=for example
457
458							Usage:
459
460							# 5 obs x 3 var, 2 such data tables
461
462							pdl> $a = random 5, 3, 2
463
464							pdl> p $a->corr_table
465							[
466							[
467							[ 1 -0.69835951 -0.18549048]
468							[-0.69835951 1 0.72481605]
469							[-0.18549048 0.72481605 1]
470							]
471							[
472							[ 1 0.82722569 -0.71779883]
473							[ 0.82722569 1 -0.63938828]
474							[-0.71779883 -0.63938828 1]
475							]
476							]
477
478							for the same result using B,
479
480							pdl> p $a->dummy(2)->corr($a->dummy(1))
481
482							This is also how to use B and B with such a table.
483							',
484							);
485
486							pp_def('t_corr',
487							Pars => 'r(); n(); [o]t()',
488							GenericTypes => $F,
489							HandleBad => 1,
490							Code => '
491							PDL_IF_BAD(
492							if ($ISBAD(r()) \|\| $ISBAD(n()) ) { $SETBAD( $t() ); continue; }
493							if ($n() <= 2) { $SETBAD(t()); continue; }
494							,)
495							$t() = $r() / pow( (1 - $r()*$r()) / ($n() - 2) , .5);
496							',
497							Doc => '
498							=for ref
499
500							t significance test for Pearson correlations.
501
502							=for example
503
504							$corr = $data->corr( $data->dummy(1) );
505							$n = $data->n_pair( $data->dummy(1) );
506							$t_corr = $corr->t_corr( $n );
507
508							use PDL::GSL::CDF;
509
510							$p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 ));
511							',
512							);
513
514							pp_def('n_pair',
515							Pars => 'a(n); b(n); indx [o]c()',
516							GenericTypes => [qw/L Q/],
517							HandleBad => 1,
518							Code => '
519							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
520							PDL_IF_BAD(loop(n) %{
521							if ($ISBAD($a()) \|\| $ISBAD($b())) continue;
522							N++;
523							%},)
524							$c() = N;
525							',
526							Doc => 'Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved)',
527							);
528
529							pp_def('corr_dev',
530							Pars => 'a(n); b(n); [o]c()',
531							GenericTypes => $F,
532							HandleBad => 1,
533							RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");',
534							Code => '
535							$GENERIC(c) ab, a2, b2, cov, va, vb;
536							ab = 0; a2 = 0; b2 = 0;
537							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
538							loop (n) %{
539							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) continue; N++;,)
540							ab += $a() * $b();
541							a2 += $a() * $a();
542							b2 += $b() * $b();
543							%}
544							if (N < 2) { $SETBAD(c()); continue; }
545							cov = ab / N;
546							va = a2 / N;
547							vb = b2 / N;
548							$c() = cov / sqrt( va * vb );
549							',
550							Doc => 'Calculates correlations from B vals. Seems faster than doing B from original vals when data pdl is big',
551							);
552
553							pp_def('t_test',
554							Pars => 'a(n); b(m); [o]t(); [o]d()',
555							GenericTypes => $F,
556							HandleBad => 1,
557							RedoDimsCode => '
558							if ($SIZE(n) < 2) $CROAK("too few N");
559							if ($SIZE(m) < 2) $CROAK("too few M");
560							',
561							Code => '
562							$GENERIC(t) N = PDL_IF_BAD(0,$SIZE(n)), M = PDL_IF_BAD(0,$SIZE(m)), sa = 0, sb = 0, a2 = 0, b2 = 0;
563							loop (n) %{
564							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
565							sa += $a();
566							a2 += $a() * $a();
567							%}
568							loop (m) %{
569							PDL_IF_BAD(if ($ISBAD($b())) continue;,)
570							sb += $b();
571							b2 += $b() * $b();
572							PDL_IF_BAD(M++;,)
573							%}
574							if (N < 2 \|\| M < 2) {
575							$SETBAD($t());
576							$SETBAD($d());
577							continue;
578							}
579							$d() = N + M - 2;
580							$GENERIC(t) va = (a2 - sa*sa/N) / (N-1);
581							$GENERIC(t) vb = (b2 - sb*sb/M) / (M-1);
582							$GENERIC(t) sdiff = sqrt( (1/N + 1/M) * ((N-1)va + (M-1)vb) / $d() );
583							$t() = (sa/N - sb/M) / sdiff;
584							',
585							Doc => '
586							=for ref
587
588							Independent sample t-test, assuming equal var.
589
590							=for example
591
592							my ($t, $df) = t_test( $pdl1, $pdl2 );
593							use PDL::GSL::CDF;
594							my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df ));
595							',
596							);
597
598							pp_def('t_test_nev',
599							Pars => 'a(n); b(m); [o]t(); [o]d()',
600							GenericTypes => $F,
601							HandleBad => 1,
602							RedoDimsCode => '
603							if ($SIZE(n) < 2) $CROAK("too few N");
604							if ($SIZE(m) < 2) $CROAK("too few M");
605							',
606							Code => '
607							$GENERIC(t) N = PDL_IF_BAD(0,$SIZE(n)), M = PDL_IF_BAD(0,$SIZE(m)), sa = 0, sb = 0, a2 = 0, b2 = 0;
608							loop (n) %{
609							PDL_IF_BAD(if ($ISBAD($a())) continue; N++;,)
610							sa += $a();
611							a2 += $a() * $a();
612							%}
613							loop (m) %{
614							PDL_IF_BAD(if ($ISBAD($b())) continue; M++;,)
615							sb += $b();
616							b2 += $b() * $b();
617							%}
618							if (N < 2 \|\| M < 2) {
619							$SETBAD($t());
620							$SETBAD($d());
621							continue;
622							}
623							$GENERIC(t) se_a_2 = (a2 - sasa/N) / (N(N-1));
624							$GENERIC(t) se_b_2 = (b2 - sbsb/M) / (M(M-1));
625							$GENERIC(t) sdiff = sqrt( se_a_2 + se_b_2 );
626							$t() = (sa/N - sb/M) / sdiff;
627							$d() = (se_a_2 + se_b_2)*(se_a_2 + se_b_2)
628							/ ( se_a_2se_a_2 / (N-1) + se_b_2se_b_2 / (M-1) )
629							;
630							',
631							Doc => 'Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R.',
632							);
633
634							pp_def('t_test_paired',
635							Pars => 'a(n); b(n); [o]t(); [o]d()',
636							GenericTypes => $F,
637							HandleBad => 1,
638							RedoDimsCode => 'if ($SIZE(n) < 2) $CROAK("too few N");',
639							Code => '
640							$GENERIC(t) N = PDL_IF_BAD(0,$SIZE(n)), s_dif = 0, diff2 = 0;
641							loop (n) %{
642							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) continue; N++;,)
643							$GENERIC(t) diff = $a() - $b();
644							s_dif += diff;
645							diff2 += diff*diff;
646							%}
647							if (N < 2) {
648							$SETBAD($t());
649							$SETBAD($d());
650							continue;
651							}
652							$d() = N - 1;
653							$t() = s_dif / sqrt( ( Ndiff2 - s_difs_dif ) / (N-1) );
654							',
655							Doc => 'Paired sample t-test.',
656							);
657
658							pp_addpm pp_line_numbers(__LINE__, <<'EOD');
659
660							=head2 binomial_test
661
662							=for Sig
663
664							Signature: (x(); n(); p_expected(); [o]p())
665
666							=for ref
667
668							Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes.
669
670							This function does NOT currently support bad value in the number of successes.
671
672							=for example
673
674							Usage:
675
676							# assume a fair coin, ie. 0.5 probablity of getting heads
677							# test whether getting 8 heads out of 10 coin flips is unusual
678
679							my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual.
680
681							=cut
682
683							*binomial_test = \&PDL::binomial_test;
684							sub PDL::binomial_test {
685	0			0	0	0	my ($x, $n, $P) = @_;
686
687	0	0				0	carp 'Please install PDL::GSL::CDF.' unless $CDF;
688	0	0				0	carp 'This function does NOT currently support bad value in the number of successes.' if $x->badflag();
689
690	0					0	my $pdlx = pdl($x);
691	0					0	$pdlx->badflag(1);
692	0					0	$pdlx = $pdlx->setvaltobad(0);
693
694	0					0	my $p = 1 - PDL::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n );
695	0					0	$p = $p->setbadtoval(1);
696	0					0	$p->badflag(0);
697
698	0					0	return $p;
699							}
700
701
702							=head1 METHODS
703
704							=head2 rtable
705
706							=for ref
707
708							Reads either file or file handle. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. Other non-numeric entries are treated as missing, which are filled with $opt{MISSN} then set to BAD. Can specify num of data rows to read from top but not arbitrary range.
709
710							*If passed handle, it will not be closed here.
711
712							=for options
713
714							Default options (case insensitive):
715
716							V => 1, # verbose. prints simple status
717							TYPE => double,
718							C_ID => 1, # boolean. file has col id.
719							R_ID => 1, # boolean. file has row id.
720							R_VAR => 0, # boolean. set to 1 if var in rows
721							SEP => "\t", # can take regex qr//
722							MISSN => -999, # this value treated as missing and set to BAD
723							NROW => '', # set to read specified num of data rows
724
725							=for usage
726
727							Usage:
728
729							Sample file diet.txt:
730
731							uid height weight diet
732							akw 72 320 1
733							bcm 68 268 1
734							clq 67 180 2
735							dwm 70 200 2
736
737							($data, $idv, $ido) = rtable 'diet.txt';
738
739							# By default prints out data info and @$idv index and element
740
741							reading diet.txt for data and id... OK.
742							data table as PDL dim o x v: PDL: Double D [4,3]
743							0 height
744							1 weight
745							2 diet
746
747							Another way of using it,
748
749							$data = rtable( \*STDIN, {TYPE=>long} );
750
751							=cut
752
753							sub rtable {
754	3			3	1	351330	# returns obs x var data matrix and var and obs ids
755							my ($src, $opt) = @_;
756	3					8
757	3	50	33			32	my $fh_in;
	3					7
758	0	0				0	if ($src =~ /STDIN/ or ref $src eq 'GLOB') { $fh_in = $src }
759							else { open $fh_in, $src or croak "$!" }
760	3					14
761							my %opt = ( V => 1,
762							TYPE => double,
763							C_ID => 1,
764							R_ID => 1,
765							R_VAR => 0,
766							SEP => "\t",
767							MISSN => -999,
768							NROW => '',
769	3	50				41	);
	3					18
770	3	50				11	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
771							$opt{V} and print "reading $src for data and id... ";
772	3					8
773							local $PDL::undefval = $opt{MISSN};
774	3					7
775	3	50				10	my $id_c = []; # match declaration of $id_r for return purpose
776	3					17	if ($opt{C_ID}) {
777	3					64	chomp( $id_c = <$fh_in> );
778	3	50				12	my @entries = split $opt{SEP}, $id_c;
779	3					8	$opt{R_ID} and shift @entries;
780							$id_c = \@entries;
781							}
782	3					18
783	3					46	my ($c_row, $id_r, $data, @data) = (0, [], PDL->null, );
784	132					157	while (<$fh_in>) {
785	132					468	chomp;
786							my @entries = split /$opt{SEP}/, $_, -1;
787	132	50				314
788							$opt{R_ID} and push @$id_r, shift @entries;
789
790	132	100	66			185	# rudimentary check for numeric entry
	669					2177
791							for (@entries) { $_ = $opt{MISSN} unless defined $_ and m/\d\b/ }
792	132					245
793	132					3025	push @data, pdl( $opt{TYPE}, \@entries );
794							$c_row ++;
795	132	50	33			486	last
796							if $opt{NROW} and $c_row == $opt{NROW};
797							}
798							# not explicitly closing $fh_in here in case it's passed from outside
799							# $fh_in will close by going out of scope if opened here.
800	3					11
801	3					250	$data = pdl $opt{TYPE}, @data;
802							@data = ();
803							# rid of last col unless there is data there
804	3	100				36	$data = $data->slice([0, $data->getdim(0)-2])->sever
805							unless ( nelem $data->slice(-1)->where($data->slice(-1) != $opt{MISSN}) );
806	3					961
807							my ($idv, $ido) = ($id_r, $id_c);
808	3	50				35	# var in columns instead of rows
809							$opt{R_VAR} == 0
810							and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r);
811	3	50				61
812	0					0	if ($opt{V}) {
813	0		0			0	print "OK.\ndata table as PDL dim o x v: " . $data->info . "\n";
814							$idv and print "$_\t$$idv[$_]\n" for 0..$#$idv;
815							}
816	3					125
817	3					31	$data = $data->setvaltobad( $opt{MISSN} );
818	3	50				748	$data->check_badflag;
		50
819							return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data;
820							}
821
822							=head2 group_by
823
824							Returns pdl reshaped according to the specified factor variable. Most useful when used in conjunction with other broadcasting calculations such as average, stdv, etc. When the factor variable contains unequal number of cases in each level, the returned pdl is padded with bad values to fit the level with the most number of cases. This allows the subsequent calculation (average, stdv, etc) to return the correct results for each level.
825
826							Usage:
827
828							# simple case with 1d pdl and equal number of n in each level of the factor
829
830							pdl> p $a = sequence 10
831							[0 1 2 3 4 5 6 7 8 9]
832
833							pdl> p $factor = $a > 4
834							[0 0 0 0 0 1 1 1 1 1]
835
836							pdl> p $a->group_by( $factor )->average
837							[2 7]
838
839							# more complex case with broadcasting and unequal number of n across levels in the factor
840
841							pdl> p $a = sequence 10,2
842							[
843							[ 0 1 2 3 4 5 6 7 8 9]
844							[10 11 12 13 14 15 16 17 18 19]
845							]
846
847							pdl> p $factor = qsort $a( ,0) % 3
848							[
849							[0 0 0 0 1 1 1 2 2 2]
850							]
851
852							pdl> p $a->group_by( $factor )
853							[
854							[
855							[ 0 1 2 3]
856							[10 11 12 13]
857							]
858							[
859							[ 4 5 6 BAD]
860							[ 14 15 16 BAD]
861							]
862							[
863							[ 7 8 9 BAD]
864							[ 17 18 19 BAD]
865							]
866							]
867							ARRAY(0xa2a4e40)
868
869							# group_by supports perl factors, multiple factors
870							# returns factor labels in addition to pdl in array context
871
872							pdl> p $a = sequence 12
873							[0 1 2 3 4 5 6 7 8 9 10 11]
874
875							pdl> $odd_even = [qw( e o e o e o e o e o e o )]
876
877							pdl> $magnitude = [qw( l l l l l l h h h h h h )]
878
879							pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude )
880
881							pdl> p $a_grouped
882							[
883							[
884							[0 2 4]
885							[1 3 5]
886							]
887							[
888							[ 6 8 10]
889							[ 7 9 11]
890							]
891							]
892
893							pdl> p Dumper $label
894							$VAR1 = [
895							[
896							'e_l',
897							'o_l'
898							],
899							[
900							'e_h',
901							'o_h'
902							]
903							];
904
905
906							=cut
907
908	3			3	0	10368	*group_by = \&PDL::group_by;
909	3					10	sub PDL::group_by {
910							my $p = shift;
911	3	100				13	my @factors = @_;
912	2					4
913	2					4	if ( @factors == 1 ) {
914	2	50				7	my $factor = $factors[0];
915	0					0	my $label;
916	0					0	if (ref $factor eq 'ARRAY') {
917							$label = _ordered_uniq($factor);
918	2					11	$factor = code_ivs($factor);
919	2					44	} else {
920							my $perl_factor = [$factor->list];
921							$label = _ordered_uniq($perl_factor);
922	2					8	}
923
924	2	100				27	my $p_reshaped = _group_by_single_factor( $p, $factor );
925
926							return wantarray? ($p_reshaped, $label) : $p_reshaped;
927							}
928	1	50				3
	2					9
929							# make sure all are arrays instead of pdls
930	1					1	@factors = map { ref($_) eq 'PDL'? [$_->list] : $_ } @factors;
931	1					2
	1					5
932	10					16	my (@cells);
	20					68
933	10					21	for my $ele (0 .. $#{$factors[0]}) {
934							my $c = join '_', map { $_->[$ele] } @factors;
935							push @cells, $c;
936	1					3	}
937	1					3	# get uniq cell labels (ref List::MoreUtils::uniq)
	10					24
938							my %seen;
939	1					5	my @uniq_cells = grep {! $seen{$_}++ } @cells;
940
941	1					3	my $flat_factor = code_ivs( \@cells );
942
943							my $p_reshaped = _group_by_single_factor( $p, $flat_factor );
944	1					3
945	1					4	# get levels of each factor and reshape accordingly
946	2					4	my @levels;
947	2					10	for (@factors) {
948	2					7	my %uniq;
949							@uniq{ @$_ } = ();
950							push @levels, scalar keys %uniq;
951	1					15	}
952
953							$p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever;
954	1					46
955	1	50				25	# make labels for the returned data structure matching pdl structure
956	1					4	my @labels;
957	2					3	if (wantarray) {
958	2					6	for my $ifactor (0 .. $#levels) {
959	4					8	my @factor_label;
960	4					10	for my $ilevel (0 .. $levels[$ifactor]-1) {
961							my $i = $ifactor * $levels[$ifactor] + $ilevel;
962	2					5	push @factor_label, $uniq_cells[$i];
963							}
964							push @labels, \@factor_label;
965							}
966	1	50				15	}
967
968							return wantarray? ($p_reshaped, \@labels) : $p_reshaped;
969							}
970
971	2			2		4	# get uniq cell labels (ref List::MoreUtils::uniq)
972							sub _ordered_uniq {
973	2					4	my $arr = shift;
974	2					5
	20					71
975							my %seen;
976	2					10	my @uniq = grep { ! $seen{$_}++ } @$arr;
977
978							return \@uniq;
979							}
980	3			3		6
981	3					5	sub _group_by_single_factor {
982							my $p = shift;
983	3					13	my $factor = shift;
984	3	50				769
985							$factor = $factor->squeeze;
986							die "Currently support only 1d factor pdl."
987	3	50				16	if $factor->ndims > 1;
988
989							die "Data pdl and factor pdl do not match!"
990							unless $factor->dim(0) == $p->dim(0);
991	3					9
992	3					6	# get active dim that will be split according to factor and dims to broadcast over
993							my @p_broadcastdims = $p->dims;
994	3					15	my $p_dim0 = shift @p_broadcastdims;
995
996	3					2174	my $uniq = $factor->uniq;
997	3					13
998	9					1967	my @uniq_ns;
999							for ($uniq->list) {
1000							push @uniq_ns, which( $factor == $_ )->nelem;
1001							}
1002	3					391
1003	3					176	# get number of n's in each group, find the biggest, fit output pdl to this
1004							my $uniq_ns = pdl \@uniq_ns;
1005	3					257	my $max = pdl(\@uniq_ns)->max->sclr;
1006	3					330
1007	3					355	my $badvalue = int($p->max + 1);
1008	9					1757	my $p_tmp = ones($max, @p_broadcastdims, $uniq->nelem) * $badvalue;
1009	9					1512	for (0 .. $#uniq_ns) {
1010							my $i = which $factor == $uniq->slice($_);
1011							$p_tmp->dice_axis(-1,$_)->squeeze->slice([0,$uniq_ns[$_]-1]) .= $p->slice($i);
1012	3					723	}
1013	3					62
1014							$p_tmp->badflag(1);
1015							return $p_tmp->setvaltobad($badvalue);
1016							}
1017
1018							=head2 which_id
1019
1020							=for ref
1021
1022							Lookup specified var (obs) ids in $idv ($ido) (see B) and return indices in $idv ($ido) as pdl if found. The indices are ordered by the specified subset. Useful for selecting data by var (obs) id.
1023
1024							=for usage
1025
1026							my $ind = which_id $ido, ['smith', 'summers', 'tesla'];
1027
1028							my $data_subset = $data( $ind, );
1029
1030							# take advantage of perl pattern matching
1031							# e.g. use data from people whose last name starts with s
1032
1033							my $i = which_id $ido, [ grep { /^s/ } @$ido ];
1034
1035							my $data_s = $data($i, );
1036
1037							=cut
1038	54			54	1	128
1039	54					89	sub which_id {
	54					283
1040	54					287	my ($id, $id_s) = @_;
1041							my %ind; @ind{ @$id } = (0 .. $#$id);
1042							pdl grep defined, map $ind{$_}, @$id_s;
1043							}
1044
1045	269			269	0	4287	my %code_bad = map +($_=>1), '', 'BAD';
1046	269	100				1464	sub code_ivs {
1047	269	100	100			44282	my ($var_ref) = @_;
1048	269		100			1060	$var_ref = [ $var_ref->list ] if UNIVERSAL::isa($var_ref, 'PDL');
	269					14843
1049	269	100				9058	my @filtered = map !defined($_) \|\| $code_bad{$_} ? undef : $_, @$var_ref;
1050	269					23405	my ($l, %level) = 0; $level{$_} //= $l++ for grep defined, @filtered;
1051	269	100				57783	my $pdl = pdl(map defined($_) ? $level{$_} : -1, @filtered)->setvaltobad(-1);
1052							$pdl->check_badflag;
1053							wantarray ? ($pdl, \%level) : $pdl;
1054							}
1055
1056
1057							=head1 SEE ALSO
1058
1059							PDL::Basic (hist for frequency counts)
1060
1061							PDL::Ufunc (sum, avg, median, min, max, etc.)
1062
1063							PDL::GSL::CDF (various cumulative distribution functions)
1064
1065							=head1 REFERENCES
1066
1067							Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers.
1068
1069							=head1 AUTHOR
1070
1071							Copyright (C) 2009 Maggie J. Xiong
1072
1073							All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
1074
1075							=cut
1076
1077							EOD
1078
1079							pp_done();