File Coverage

blib/lib/PDL/Stats/Basic.pm

Criterion	Covered	Total	%
statement	9	9	100.0
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	12	12	100.0

line	stmt	sub	time	code
1				#
2				# GENERATED WITH PDL::PP from lib/PDL/Stats/Basic.pd! Don't modify!
3				#
4				package PDL::Stats::Basic;
5
6				our @EXPORT_OK = qw(binomial_test rtable which_id code_ivs stdv stdv_unbiased var var_unbiased se ss skew skew_unbiased kurt kurt_unbiased cov cov_table corr corr_table t_corr n_pair corr_dev t_test t_test_nev t_test_paired );
7				our %EXPORT_TAGS = (Func=>\@EXPORT_OK);
8
9	4	4	533364	use PDL::Core;
	4		81242
	4		36
10	4	4	1949	use PDL::Exporter;
	4		18
	4		41
11	4	4	246	use DynaLoader;
	4		10
	4		2279
12
13
14
15				our @ISA = ( 'PDL::Exporter','DynaLoader' );
16				push @PDL::Core::PP, __PACKAGE__;
17				bootstrap PDL::Stats::Basic ;
18
19
20
21
22
23
24
25
26				#line 9 "lib/PDL/Stats/Basic.pd"
27
28				use strict;
29				use warnings;
30				use PDL::LiteF;
31				use Carp;
32
33				eval { require PDL::Core; require PDL::GSL::CDF; };
34				my $CDF = 1 if !$@;
35
36				=head1 NAME
37
38				PDL::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests.
39
40				=head1 DESCRIPTION
41
42				The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are broadcastable and methods that are NOT broadcastable, respectively.
43
44				Does not have mean or median function here. see SEE ALSO.
45
46				=head1 SYNOPSIS
47
48				use PDL::LiteF;
49				use PDL::Stats::Basic;
50
51				my $stdv = $data->stdv;
52
53				or
54
55				my $stdv = stdv( $data );
56
57				=cut
58				#line 59 "lib/PDL/Stats/Basic.pm"
59
60
61				=head1 FUNCTIONS
62
63				=cut
64
65
66
67
68
69
70				=head2 stdv
71
72				=for sig
73
74				Signature: (a(n); [o]b())
75				Types: (float double)
76
77				=for usage
78
79				$b = stdv($a);
80				stdv($a, $b); # all arguments given
81				$b = $a->stdv; # method call
82				$a->stdv($b);
83
84				=for ref
85
86				Sample standard deviation.
87
88				=pod
89
90				Broadcasts over its inputs.
91
92				=for bad
93
94				C processes bad values.
95				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
96
97				=cut
98
99
100
101
102				*stdv = \&PDL::stdv;
103
104
105
106
107
108
109				=head2 stdv_unbiased
110
111				=for sig
112
113				Signature: (a(n); [o]b())
114				Types: (float double)
115
116				=for usage
117
118				$b = stdv_unbiased($a);
119				stdv_unbiased($a, $b); # all arguments given
120				$b = $a->stdv_unbiased; # method call
121				$a->stdv_unbiased($b);
122
123				=for ref
124
125				Unbiased estimate of population standard deviation.
126
127				=pod
128
129				Broadcasts over its inputs.
130
131				=for bad
132
133				C processes bad values.
134				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
135
136				=cut
137
138
139
140
141				*stdv_unbiased = \&PDL::stdv_unbiased;
142
143
144
145
146
147
148				=head2 var
149
150				=for sig
151
152				Signature: (a(n); [o]b())
153				Types: (float double)
154
155				=for usage
156
157				$b = var($a);
158				var($a, $b); # all arguments given
159				$b = $a->var; # method call
160				$a->var($b);
161
162				=for ref
163
164				Sample variance.
165
166				=pod
167
168				Broadcasts over its inputs.
169
170				=for bad
171
172				C `processes bad values.`
173				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
174
175				=cut
176
177
178
179
180				*var = \&PDL::var;
181
182
183
184
185
186
187				=head2 var_unbiased
188
189				=for sig
190
191				Signature: (a(n); [o]b())
192				Types: (float double)
193
194				=for usage
195
196				$b = var_unbiased($a);
197				var_unbiased($a, $b); # all arguments given
198				$b = $a->var_unbiased; # method call
199				$a->var_unbiased($b);
200
201				=for ref
202
203				Unbiased estimate of population variance.
204
205				=pod
206
207				Broadcasts over its inputs.
208
209				=for bad
210
211				C processes bad values.
212				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
213
214				=cut
215
216
217
218
219				*var_unbiased = \&PDL::var_unbiased;
220
221
222
223
224
225
226				=head2 se
227
228				=for sig
229
230				Signature: (a(n); [o]b())
231				Types: (float double)
232
233				=for usage
234
235				$b = se($a);
236				se($a, $b); # all arguments given
237				$b = $a->se; # method call
238				$a->se($b);
239
240				=for ref
241
242				Standard error of the mean. Useful for calculating confidence intervals.
243
244				=for example
245
246				# 95% confidence interval for samples with large N
247				$ci_95_upper = $data->average + 1.96 * $data->se;
248				$ci_95_lower = $data->average - 1.96 * $data->se;
249
250
251				=pod
252
253				Broadcasts over its inputs.
254
255				=for bad
256
257				C processes bad values.
258				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
259
260				=cut
261
262
263
264
265				*se = \&PDL::se;
266
267
268
269
270
271
272				=head2 ss
273
274				=for sig
275
276				Signature: (a(n); [o]b())
277				Types: (float double)
278
279				=for usage
280
281				$b = ss($a);
282				ss($a, $b); # all arguments given
283				$b = $a->ss; # method call
284				$a->ss($b);
285
286				=for ref
287
288				Sum of squared deviations from the mean.
289
290				=pod
291
292				Broadcasts over its inputs.
293
294				=for bad
295
296				C processes bad values.
297				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
298
299				=cut
300
301
302
303
304				*ss = \&PDL::ss;
305
306
307
308
309
310
311				=head2 skew
312
313				=for sig
314
315				Signature: (a(n); [o]b())
316				Types: (float double)
317
318				=for usage
319
320				$b = skew($a);
321				skew($a, $b); # all arguments given
322				$b = $a->skew; # method call
323				$a->skew($b);
324
325				=for ref
326
327				Sample skewness, measure of asymmetry in data. skewness == 0 for normal distribution.
328
329				=pod
330
331				Broadcasts over its inputs.
332
333				=for bad
334
335				C processes bad values.
336				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
337
338				=cut
339
340
341
342
343				*skew = \&PDL::skew;
344
345
346
347
348
349
350				=head2 skew_unbiased
351
352				=for sig
353
354				Signature: (a(n); [o]b())
355				Types: (float double)
356
357				=for usage
358
359				$b = skew_unbiased($a);
360				skew_unbiased($a, $b); # all arguments given
361				$b = $a->skew_unbiased; # method call
362				$a->skew_unbiased($b);
363
364				=for ref
365
366				Unbiased estimate of population skewness. This is the number in GNumeric Descriptive Statistics.
367
368				=pod
369
370				Broadcasts over its inputs.
371
372				=for bad
373
374				C processes bad values.
375				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
376
377				=cut
378
379
380
381
382				*skew_unbiased = \&PDL::skew_unbiased;
383
384
385
386
387
388
389				=head2 kurt
390
391				=for sig
392
393				Signature: (a(n); [o]b())
394				Types: (float double)
395
396				=for usage
397
398				$b = kurt($a);
399				kurt($a, $b); # all arguments given
400				$b = $a->kurt; # method call
401				$a->kurt($b);
402
403				=for ref
404
405				Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution.
406
407				=pod
408
409				Broadcasts over its inputs.
410
411				=for bad
412
413				C processes bad values.
414				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
415
416				=cut
417
418
419
420
421				*kurt = \&PDL::kurt;
422
423
424
425
426
427
428				=head2 kurt_unbiased
429
430				=for sig
431
432				Signature: (a(n); [o]b())
433				Types: (float double)
434
435				=for usage
436
437				$b = kurt_unbiased($a);
438				kurt_unbiased($a, $b); # all arguments given
439				$b = $a->kurt_unbiased; # method call
440				$a->kurt_unbiased($b);
441
442				=for ref
443
444				Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics.
445
446				=pod
447
448				Broadcasts over its inputs.
449
450				=for bad
451
452				C processes bad values.
453				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
454
455				=cut
456
457
458
459
460				*kurt_unbiased = \&PDL::kurt_unbiased;
461
462
463
464
465
466
467				=head2 cov
468
469				=for sig
470
471				Signature: (a(n); b(n); [o]c())
472				Types: (float double)
473
474				=for usage
475
476				$c = cov($a, $b);
477				cov($a, $b, $c); # all arguments given
478				$c = $a->cov($b); # method call
479				$a->cov($b, $c);
480
481				=for ref
482
483				Sample covariance. see B for ways to call
484
485				=pod
486
487				Broadcasts over its inputs.
488
489				=for bad
490
491				C processes bad values.
492				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
493
494				=cut
495
496
497
498
499				*cov = \&PDL::cov;
500
501
502
503
504
505
506				=head2 cov_table
507
508				=for sig
509
510				Signature: (a(n,m); [o]c(m,m))
511				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
512				float double ldouble)
513
514				=for usage
515
516				$c = cov_table($a);
517				cov_table($a, $c); # all arguments given
518				$c = $a->cov_table; # method call
519				$a->cov_table($c);
520
521				=for ref
522
523				Square covariance table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls.
524
525				=for example
526
527				Usage:
528
529				# 5 obs x 3 var, 2 such data tables
530
531				pdl> $a = random 5, 3, 2
532
533				pdl> p $cov = $a->cov_table
534				[
535				[
536				[ 8.9636438 -1.8624472 -1.2416588]
537				[-1.8624472 14.341514 -1.4245366]
538				[-1.2416588 -1.4245366 9.8690655]
539				]
540				[
541				[ 10.32644 -0.31311789 -0.95643674]
542				[-0.31311789 15.051779 -7.2759577]
543				[-0.95643674 -7.2759577 5.4465141]
544				]
545				]
546				# diagonal elements of the cov table are the variances
547				pdl> p $a->var
548				[
549				[ 8.9636438 14.341514 9.8690655]
550				[ 10.32644 15.051779 5.4465141]
551				]
552
553				for the same cov matrix table using B,
554
555				pdl> p $a->dummy(2)->cov($a->dummy(1))
556
557
558				=pod
559
560				Broadcasts over its inputs.
561
562				=for bad
563
564				C processes bad values.
565				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
566
567				=cut
568
569
570
571
572				*cov_table = \&PDL::cov_table;
573
574
575
576
577
578
579				=head2 corr
580
581				=for sig
582
583				Signature: (a(n); b(n); [o]c())
584				Types: (float double)
585
586				=for usage
587
588				$c = corr($a, $b);
589				corr($a, $b, $c); # all arguments given
590				$c = $a->corr($b); # method call
591				$a->corr($b, $c);
592
593				=for ref
594
595				Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)).
596
597				=for example
598
599				Usage:
600
601				pdl> $a = random 5, 3
602				pdl> $b = sequence 5,3
603				pdl> p $a->corr($b)
604
605				[0.20934208 0.30949881 0.26713007]
606
607				for square corr table
608
609				pdl> p $a->corr($a->dummy(1))
610
611				[
612				[ 1 -0.41995259 -0.029301192]
613				[ -0.41995259 1 -0.61927619]
614				[-0.029301192 -0.61927619 1]
615				]
616
617				but it is easier and faster to use B.
618
619
620				=pod
621
622				Broadcasts over its inputs.
623
624				=for bad
625
626				C processes bad values.
627				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
628
629				=cut
630
631
632
633
634				*corr = \&PDL::corr;
635
636
637
638
639
640
641				=head2 corr_table
642
643				=for sig
644
645				Signature: (a(n,m); [o]c(m,m))
646				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
647				float double ldouble)
648
649				=for usage
650
651				$c = corr_table($a);
652				corr_table($a, $c); # all arguments given
653				$c = $a->corr_table; # method call
654				$a->corr_table($c);
655
656				=for ref
657
658				Square Pearson correlation table. Gives the same result as broadcasting using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls.
659
660				=for example
661
662				Usage:
663
664				# 5 obs x 3 var, 2 such data tables
665
666				pdl> $a = random 5, 3, 2
667
668				pdl> p $a->corr_table
669				[
670				[
671				[ 1 -0.69835951 -0.18549048]
672				[-0.69835951 1 0.72481605]
673				[-0.18549048 0.72481605 1]
674				]
675				[
676				[ 1 0.82722569 -0.71779883]
677				[ 0.82722569 1 -0.63938828]
678				[-0.71779883 -0.63938828 1]
679				]
680				]
681
682				for the same result using B,
683
684				pdl> p $a->dummy(2)->corr($a->dummy(1))
685
686				This is also how to use B and B with such a table.
687
688
689				=pod
690
691				Broadcasts over its inputs.
692
693				=for bad
694
695				C processes bad values.
696				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
697
698				=cut
699
700
701
702
703				*corr_table = \&PDL::corr_table;
704
705
706
707
708
709
710				=head2 t_corr
711
712				=for sig
713
714				Signature: (r(); n(); [o]t())
715				Types: (float double)
716
717				=for usage
718
719				$t = t_corr($r, $n);
720				t_corr($r, $n, $t); # all arguments given
721				$t = $r->t_corr($n); # method call
722				$r->t_corr($n, $t);
723
724				=for ref
725
726				t significance test for Pearson correlations.
727
728				=for example
729
730				$corr = $data->corr( $data->dummy(1) );
731				$n = $data->n_pair( $data->dummy(1) );
732				$t_corr = $corr->t_corr( $n );
733
734				use PDL::GSL::CDF;
735
736				$p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 ));
737
738
739				=pod
740
741				Broadcasts over its inputs.
742
743				=for bad
744
745				C processes bad values.
746				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
747
748				=cut
749
750
751
752
753				*t_corr = \&PDL::t_corr;
754
755
756
757
758
759
760				=head2 n_pair
761
762				=for sig
763
764				Signature: (a(n); b(n); indx [o]c())
765				Types: (long longlong)
766
767				=for usage
768
769				$c = n_pair($a, $b);
770				n_pair($a, $b, $c); # all arguments given
771				$c = $a->n_pair($b); # method call
772				$a->n_pair($b, $c);
773
774				=for ref
775
776				Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved)
777
778				=pod
779
780				Broadcasts over its inputs.
781
782				=for bad
783
784				C processes bad values.
785				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
786
787				=cut
788
789
790
791
792				*n_pair = \&PDL::n_pair;
793
794
795
796
797
798
799				=head2 corr_dev
800
801				=for sig
802
803				Signature: (a(n); b(n); [o]c())
804				Types: (float double)
805
806				=for usage
807
808				$c = corr_dev($a, $b);
809				corr_dev($a, $b, $c); # all arguments given
810				$c = $a->corr_dev($b); # method call
811				$a->corr_dev($b, $c);
812
813				=for ref
814
815				Calculates correlations from B vals. Seems faster than doing B from original vals when data pdl is big
816
817				=pod
818
819				Broadcasts over its inputs.
820
821				=for bad
822
823				C processes bad values.
824				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
825
826				=cut
827
828
829
830
831				*corr_dev = \&PDL::corr_dev;
832
833
834
835
836
837
838				=head2 t_test
839
840				=for sig
841
842				Signature: (a(n); b(m); [o]t(); [o]d())
843				Types: (float double)
844
845				=for usage
846
847				($t, $d) = t_test($a, $b);
848				t_test($a, $b, $t, $d); # all arguments given
849				($t, $d) = $a->t_test($b); # method call
850				$a->t_test($b, $t, $d);
851
852				=for ref
853
854				Independent sample t-test, assuming equal var.
855
856				=for example
857
858				my ($t, $df) = t_test( $pdl1, $pdl2 );
859				use PDL::GSL::CDF;
860				my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df ));
861
862
863				=pod
864
865				Broadcasts over its inputs.
866
867				=for bad
868
869				C processes bad values.
870				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
871
872				=cut
873
874
875
876
877				*t_test = \&PDL::t_test;
878
879
880
881
882
883
884				=head2 t_test_nev
885
886				=for sig
887
888				Signature: (a(n); b(m); [o]t(); [o]d())
889				Types: (float double)
890
891				=for usage
892
893				($t, $d) = t_test_nev($a, $b);
894				t_test_nev($a, $b, $t, $d); # all arguments given
895				($t, $d) = $a->t_test_nev($b); # method call
896				$a->t_test_nev($b, $t, $d);
897
898				=for ref
899
900				Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R.
901
902				=pod
903
904				Broadcasts over its inputs.
905
906				=for bad
907
908				C processes bad values.
909				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
910
911				=cut
912
913
914
915
916				*t_test_nev = \&PDL::t_test_nev;
917
918
919
920
921
922
923				=head2 t_test_paired
924
925				=for sig
926
927				Signature: (a(n); b(n); [o]t(); [o]d())
928				Types: (float double)
929
930				=for usage
931
932				($t, $d) = t_test_paired($a, $b);
933				t_test_paired($a, $b, $t, $d); # all arguments given
934				($t, $d) = $a->t_test_paired($b); # method call
935				$a->t_test_paired($b, $t, $d);
936
937				=for ref
938
939				Paired sample t-test.
940
941				=pod
942
943				Broadcasts over its inputs.
944
945				=for bad
946
947				C processes bad values.
948				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
949
950				=cut
951
952
953
954
955				*t_test_paired = \&PDL::t_test_paired;
956
957
958
959
960
961				#line 658 "lib/PDL/Stats/Basic.pd"
962
963				#line 659 "lib/PDL/Stats/Basic.pd"
964
965				=head2 binomial_test
966
967				=for Sig
968
969				Signature: (x(); n(); p_expected(); [o]p())
970
971				=for ref
972
973				Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes.
974
975				This function does NOT currently support bad value in the number of successes.
976
977				=for example
978
979				Usage:
980
981				# assume a fair coin, ie. 0.5 probablity of getting heads
982				# test whether getting 8 heads out of 10 coin flips is unusual
983
984				my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual.
985
986				=cut
987
988				*binomial_test = \&PDL::binomial_test;
989				sub PDL::binomial_test {
990				my ($x, $n, $P) = @_;
991
992				carp 'Please install PDL::GSL::CDF.' unless $CDF;
993				carp 'This function does NOT currently support bad value in the number of successes.' if $x->badflag();
994
995				my $pdlx = pdl($x);
996				$pdlx->badflag(1);
997				$pdlx = $pdlx->setvaltobad(0);
998
999				my $p = 1 - PDL::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n );
1000				$p = $p->setbadtoval(1);
1001				$p->badflag(0);
1002
1003				return $p;
1004				}
1005
1006				=head1 METHODS
1007
1008				=head2 rtable
1009
1010				=for ref
1011
1012				Reads either file or file handle. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. Other non-numeric entries are treated as missing, which are filled with $opt{MISSN} then set to BAD. Can specify num of data rows to read from top but not arbitrary range.
1013
1014				*If passed handle, it will not be closed here.
1015
1016				=for options
1017
1018				Default options (case insensitive):
1019
1020				V => 1, # verbose. prints simple status
1021				TYPE => double,
1022				C_ID => 1, # boolean. file has col id.
1023				R_ID => 1, # boolean. file has row id.
1024				R_VAR => 0, # boolean. set to 1 if var in rows
1025				SEP => "\t", # can take regex qr//
1026				MISSN => -999, # this value treated as missing and set to BAD
1027				NROW => '', # set to read specified num of data rows
1028
1029				=for usage
1030
1031				Usage:
1032
1033				Sample file diet.txt:
1034
1035				uid height weight diet
1036				akw 72 320 1
1037				bcm 68 268 1
1038				clq 67 180 2
1039				dwm 70 200 2
1040
1041				($data, $idv, $ido) = rtable 'diet.txt';
1042
1043				# By default prints out data info and @$idv index and element
1044
1045				reading diet.txt for data and id... OK.
1046				data table as PDL dim o x v: PDL: Double D [4,3]
1047				0 height
1048				1 weight
1049				2 diet
1050
1051				Another way of using it,
1052
1053				$data = rtable( \*STDIN, {TYPE=>long} );
1054
1055				=cut
1056
1057				sub rtable {
1058				# returns obs x var data matrix and var and obs ids
1059				my ($src, $opt) = @_;
1060
1061				my $fh_in;
1062				if ($src =~ /STDIN/ or ref $src eq 'GLOB') { $fh_in = $src }
1063				else { open $fh_in, $src or croak "$!" }
1064
1065				my %opt = ( V => 1,
1066				TYPE => double,
1067				C_ID => 1,
1068				R_ID => 1,
1069				R_VAR => 0,
1070				SEP => "\t",
1071				MISSN => -999,
1072				NROW => '',
1073				);
1074				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1075				$opt{V} and print "reading $src for data and id... ";
1076
1077				local $PDL::undefval = $opt{MISSN};
1078
1079				my $id_c = []; # match declaration of $id_r for return purpose
1080				if ($opt{C_ID}) {
1081				chomp( $id_c = <$fh_in> );
1082				my @entries = split $opt{SEP}, $id_c;
1083				$opt{R_ID} and shift @entries;
1084				$id_c = \@entries;
1085				}
1086
1087				my ($c_row, $id_r, $data, @data) = (0, [], PDL->null, );
1088				while (<$fh_in>) {
1089				chomp;
1090				my @entries = split /$opt{SEP}/, $_, -1;
1091
1092				$opt{R_ID} and push @$id_r, shift @entries;
1093
1094				# rudimentary check for numeric entry
1095				for (@entries) { $_ = $opt{MISSN} unless defined $_ and m/\d\b/ }
1096
1097				push @data, pdl( $opt{TYPE}, \@entries );
1098				$c_row ++;
1099				last
1100				if $opt{NROW} and $c_row == $opt{NROW};
1101				}
1102				# not explicitly closing $fh_in here in case it's passed from outside
1103				# $fh_in will close by going out of scope if opened here.
1104
1105				$data = pdl $opt{TYPE}, @data;
1106				@data = ();
1107				# rid of last col unless there is data there
1108				$data = $data->slice([0, $data->getdim(0)-2])->sever
1109				unless ( nelem $data->slice(-1)->where($data->slice(-1) != $opt{MISSN}) );
1110
1111				my ($idv, $ido) = ($id_r, $id_c);
1112				# var in columns instead of rows
1113				$opt{R_VAR} == 0
1114				and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r);
1115
1116				if ($opt{V}) {
1117				print "OK.\ndata table as PDL dim o x v: " . $data->info . "\n";
1118				$idv and print "$_\t$$idv[$_]\n" for 0..$#$idv;
1119				}
1120
1121				$data = $data->setvaltobad( $opt{MISSN} );
1122				$data->check_badflag;
1123				return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data;
1124				}
1125
1126				=head2 group_by
1127
1128				Returns pdl reshaped according to the specified factor variable. Most useful when used in conjunction with other broadcasting calculations such as average, stdv, etc. When the factor variable contains unequal number of cases in each level, the returned pdl is padded with bad values to fit the level with the most number of cases. This allows the subsequent calculation (average, stdv, etc) to return the correct results for each level.
1129
1130				Usage:
1131
1132				# simple case with 1d pdl and equal number of n in each level of the factor
1133
1134				pdl> p $a = sequence 10
1135				[0 1 2 3 4 5 6 7 8 9]
1136
1137				pdl> p $factor = $a > 4
1138				[0 0 0 0 0 1 1 1 1 1]
1139
1140				pdl> p $a->group_by( $factor )->average
1141				[2 7]
1142
1143				# more complex case with broadcasting and unequal number of n across levels in the factor
1144
1145				pdl> p $a = sequence 10,2
1146				[
1147				[ 0 1 2 3 4 5 6 7 8 9]
1148				[10 11 12 13 14 15 16 17 18 19]
1149				]
1150
1151				pdl> p $factor = qsort $a( ,0) % 3
1152				[
1153				[0 0 0 0 1 1 1 2 2 2]
1154				]
1155
1156				pdl> p $a->group_by( $factor )
1157				[
1158				[
1159				[ 0 1 2 3]
1160				[10 11 12 13]
1161				]
1162				[
1163				[ 4 5 6 BAD]
1164				[ 14 15 16 BAD]
1165				]
1166				[
1167				[ 7 8 9 BAD]
1168				[ 17 18 19 BAD]
1169				]
1170				]
1171				ARRAY(0xa2a4e40)
1172
1173				# group_by supports perl factors, multiple factors
1174				# returns factor labels in addition to pdl in array context
1175
1176				pdl> p $a = sequence 12
1177				[0 1 2 3 4 5 6 7 8 9 10 11]
1178
1179				pdl> $odd_even = [qw( e o e o e o e o e o e o )]
1180
1181				pdl> $magnitude = [qw( l l l l l l h h h h h h )]
1182
1183				pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude )
1184
1185				pdl> p $a_grouped
1186				[
1187				[
1188				[0 2 4]
1189				[1 3 5]
1190				]
1191				[
1192				[ 6 8 10]
1193				[ 7 9 11]
1194				]
1195				]
1196
1197				pdl> p Dumper $label
1198				$VAR1 = [
1199				[
1200				'e_l',
1201				'o_l'
1202				],
1203				[
1204				'e_h',
1205				'o_h'
1206				]
1207				];
1208
1209				=cut
1210
1211				*group_by = \&PDL::group_by;
1212				sub PDL::group_by {
1213				my $p = shift;
1214				my @factors = @_;
1215
1216				if ( @factors == 1 ) {
1217				my $factor = $factors[0];
1218				my $label;
1219				if (ref $factor eq 'ARRAY') {
1220				$label = _ordered_uniq($factor);
1221				$factor = code_ivs($factor);
1222				} else {
1223				my $perl_factor = [$factor->list];
1224				$label = _ordered_uniq($perl_factor);
1225				}
1226
1227				my $p_reshaped = _group_by_single_factor( $p, $factor );
1228
1229				return wantarray? ($p_reshaped, $label) : $p_reshaped;
1230				}
1231
1232				# make sure all are arrays instead of pdls
1233				@factors = map { ref($_) eq 'PDL'? [$_->list] : $_ } @factors;
1234
1235				my (@cells);
1236				for my $ele (0 .. $#{$factors[0]}) {
1237				my $c = join '_', map { $_->[$ele] } @factors;
1238				push @cells, $c;
1239				}
1240				# get uniq cell labels (ref List::MoreUtils::uniq)
1241				my %seen;
1242				my @uniq_cells = grep {! $seen{$_}++ } @cells;
1243
1244				my $flat_factor = code_ivs( \@cells );
1245
1246				my $p_reshaped = _group_by_single_factor( $p, $flat_factor );
1247
1248				# get levels of each factor and reshape accordingly
1249				my @levels;
1250				for (@factors) {
1251				my %uniq;
1252				@uniq{ @$_ } = ();
1253				push @levels, scalar keys %uniq;
1254				}
1255
1256				$p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever;
1257
1258				# make labels for the returned data structure matching pdl structure
1259				my @labels;
1260				if (wantarray) {
1261				for my $ifactor (0 .. $#levels) {
1262				my @factor_label;
1263				for my $ilevel (0 .. $levels[$ifactor]-1) {
1264				my $i = $ifactor * $levels[$ifactor] + $ilevel;
1265				push @factor_label, $uniq_cells[$i];
1266				}
1267				push @labels, \@factor_label;
1268				}
1269				}
1270
1271				return wantarray? ($p_reshaped, \@labels) : $p_reshaped;
1272				}
1273
1274				# get uniq cell labels (ref List::MoreUtils::uniq)
1275				sub _ordered_uniq {
1276				my $arr = shift;
1277
1278				my %seen;
1279				my @uniq = grep { ! $seen{$_}++ } @$arr;
1280
1281				return \@uniq;
1282				}
1283
1284				sub _group_by_single_factor {
1285				my $p = shift;
1286				my $factor = shift;
1287
1288				$factor = $factor->squeeze;
1289				die "Currently support only 1d factor pdl."
1290				if $factor->ndims > 1;
1291
1292				die "Data pdl and factor pdl do not match!"
1293				unless $factor->dim(0) == $p->dim(0);
1294
1295				# get active dim that will be split according to factor and dims to broadcast over
1296				my @p_broadcastdims = $p->dims;
1297				my $p_dim0 = shift @p_broadcastdims;
1298
1299				my $uniq = $factor->uniq;
1300
1301				my @uniq_ns;
1302				for ($uniq->list) {
1303				push @uniq_ns, which( $factor == $_ )->nelem;
1304				}
1305
1306				# get number of n's in each group, find the biggest, fit output pdl to this
1307				my $uniq_ns = pdl \@uniq_ns;
1308				my $max = pdl(\@uniq_ns)->max->sclr;
1309
1310				my $badvalue = int($p->max + 1);
1311				my $p_tmp = ones($max, @p_broadcastdims, $uniq->nelem) * $badvalue;
1312				for (0 .. $#uniq_ns) {
1313				my $i = which $factor == $uniq->slice($_);
1314				$p_tmp->dice_axis(-1,$_)->squeeze->slice([0,$uniq_ns[$_]-1]) .= $p->slice($i);
1315				}
1316
1317				$p_tmp->badflag(1);
1318				return $p_tmp->setvaltobad($badvalue);
1319				}
1320
1321				=head2 which_id
1322
1323				=for ref
1324
1325				Lookup specified var (obs) ids in $idv ($ido) (see B) and return indices in $idv ($ido) as pdl if found. The indices are ordered by the specified subset. Useful for selecting data by var (obs) id.
1326
1327				=for usage
1328
1329				my $ind = which_id $ido, ['smith', 'summers', 'tesla'];
1330
1331				my $data_subset = $data( $ind, );
1332
1333				# take advantage of perl pattern matching
1334				# e.g. use data from people whose last name starts with s
1335
1336				my $i = which_id $ido, [ grep { /^s/ } @$ido ];
1337
1338				my $data_s = $data($i, );
1339
1340				=cut
1341
1342				sub which_id {
1343				my ($id, $id_s) = @_;
1344				my %ind; @ind{ @$id } = (0 .. $#$id);
1345				pdl grep defined, map $ind{$_}, @$id_s;
1346				}
1347
1348				my %code_bad = map +($_=>1), '', 'BAD';
1349				sub code_ivs {
1350				my ($var_ref) = @_;
1351				$var_ref = [ $var_ref->list ] if UNIVERSAL::isa($var_ref, 'PDL');
1352				my @filtered = map !defined($_) \|\| $code_bad{$_} ? undef : $_, @$var_ref;
1353				my ($l, %level) = 0; $level{$_} //= $l++ for grep defined, @filtered;
1354				my $pdl = pdl(map defined($_) ? $level{$_} : -1, @filtered)->setvaltobad(-1);
1355				$pdl->check_badflag;
1356				wantarray ? ($pdl, \%level) : $pdl;
1357				}
1358
1359				=head1 SEE ALSO
1360
1361				PDL::Basic (hist for frequency counts)
1362
1363				PDL::Ufunc (sum, avg, median, min, max, etc.)
1364
1365				PDL::GSL::CDF (various cumulative distribution functions)
1366
1367				=head1 REFERENCES
1368
1369				Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers.
1370
1371				=head1 AUTHOR
1372
1373				Copyright (C) 2009 Maggie J. Xiong
1374
1375				All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDL distribution.
1376
1377				=cut
1378				#line 1379 "lib/PDL/Stats/Basic.pm"
1379
1380				# Exit with OK status
1381
1382				1;