File Coverage

blib/lib/PDLA/Stats/Basic.pm

Criterion	Covered	Total	%
statement	132	147	89.8
branch	33	56	58.9
condition	13	24	54.1
subroutine	12	13	92.3
pod	2	4	50.0
total	192	244	78.6

line	stmt	bran	cond	sub	pod	time	code
1
2							#
3							# GENERATED WITH PDLA::PP! Don't modify!
4							#
5							package PDLA::Stats::Basic;
6
7							@EXPORT_OK = qw( binomial_test rtable which_id PDLA::PP stdv PDLA::PP stdv_unbiased PDLA::PP var PDLA::PP var_unbiased PDLA::PP se PDLA::PP ss PDLA::PP skew PDLA::PP skew_unbiased PDLA::PP kurt PDLA::PP kurt_unbiased PDLA::PP cov PDLA::PP cov_table PDLA::PP corr PDLA::PP corr_table PDLA::PP t_corr PDLA::PP n_pair PDLA::PP corr_dev PDLA::PP t_test PDLA::PP t_test_nev PDLA::PP t_test_paired );
8							%EXPORT_TAGS = (Func=>[@EXPORT_OK]);
9
10	5			5		297597	use PDLA::Core;
	5					86823
	5					39
11	5			5		1354	use PDLA::Exporter;
	5					7
	5					27
12	5			5		103	use DynaLoader;
	5					9
	5					244
13
14
15
16
17							@ISA = ( 'PDLA::Exporter','DynaLoader' );
18							push @PDLA::Core::PP, __PACKAGE__;
19							bootstrap PDLA::Stats::Basic ;
20
21
22
23
24
25	5			5		717	use PDLA::LiteF;
	5					960
	5					29
26	5			5		172345	use PDLA::NiceSlice;
	5					19
	5					43
27	5			5		86215	use Carp;
	5					12
	5					9480
28
29							$PDLA::onlinedoc->scan(__FILE__) if $PDLA::onlinedoc;
30
31							eval { require PDLA::GSL::CDF; };
32							my $CDF = 1 if !$@;
33
34							=head1 NAME
35
36							PDLA::Stats::Basic -- basic statistics and related utilities such as standard deviation, Pearson correlation, and t-tests.
37
38							=head1 DESCRIPTION
39
40							The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively.
41
42							Does not have mean or median function here. see SEE ALSO.
43
44							=head1 SYNOPSIS
45
46							use PDLA::LiteF;
47							use PDLA::NiceSlice;
48							use PDLA::Stats::Basic;
49
50							my $stdv = $data->stdv;
51
52							or
53
54							my $stdv = stdv( $data );
55
56							=cut
57
58
59
60
61
62
63
64							=head1 FUNCTIONS
65
66
67
68							=cut
69
70
71
72
73
74
75							=head2 stdv
76
77							=for sig
78
79							Signature: (a(n); float+ [o]b())
80
81
82
83							=for ref
84
85							Sample standard deviation.
86
87							=cut
88
89
90							=for bad
91
92							stdv processes bad values.
93							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
94
95
96							=cut
97
98
99
100
101
102
103							*stdv = \&PDLA::stdv;
104
105
106
107
108
109							=head2 stdv_unbiased
110
111							=for sig
112
113							Signature: (a(n); float+ [o]b())
114
115
116
117							=for ref
118
119							Unbiased estimate of population standard deviation.
120
121							=cut
122
123
124							=for bad
125
126							stdv_unbiased processes bad values.
127							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
128
129
130							=cut
131
132
133
134
135
136
137							*stdv_unbiased = \&PDLA::stdv_unbiased;
138
139
140
141
142
143							=head2 var
144
145							=for sig
146
147							Signature: (a(n); float+ [o]b())
148
149
150
151							=for ref
152
153							Sample variance.
154
155							=cut
156
157
158							=for bad
159
160							var processes bad values.
161							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
162
163
164							=cut
165
166
167
168
169
170
171							*var = \&PDLA::var;
172
173
174
175
176
177							=head2 var_unbiased
178
179							=for sig
180
181							Signature: (a(n); float+ [o]b())
182
183
184
185							=for ref
186
187							Unbiased estimate of population variance.
188
189							=cut
190
191
192							=for bad
193
194							var_unbiased processes bad values.
195							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
196
197
198							=cut
199
200
201
202
203
204
205							*var_unbiased = \&PDLA::var_unbiased;
206
207
208
209
210
211							=head2 se
212
213							=for sig
214
215							Signature: (a(n); float+ [o]b())
216
217
218
219							=for ref
220
221							Standard error of the mean. Useful for calculating confidence intervals.
222
223							=for usage
224
225							# 95% confidence interval for samples with large N
226
227							$ci_95_upper = $data->average + 1.96 * $data->se;
228							$ci_95_lower = $data->average - 1.96 * $data->se;
229
230
231
232
233							=for bad
234
235							se processes bad values.
236							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
237
238
239							=cut
240
241
242
243
244
245
246							*se = \&PDLA::se;
247
248
249
250
251
252							=head2 ss
253
254							=for sig
255
256							Signature: (a(n); float+ [o]b())
257
258
259
260							=for ref
261
262							Sum of squared deviations from the mean.
263
264							=cut
265
266
267							=for bad
268
269							ss processes bad values.
270							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
271
272
273							=cut
274
275
276
277
278
279
280							*ss = \&PDLA::ss;
281
282
283
284
285
286							=head2 skew
287
288							=for sig
289
290							Signature: (a(n); float+ [o]b())
291
292
293
294							=for ref
295
296							Sample skewness, measure of asymmetry in data. skewness == 0 for normal distribution.
297
298							=cut
299
300
301							=for bad
302
303							skew processes bad values.
304							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
305
306
307							=cut
308
309
310
311
312
313
314							*skew = \&PDLA::skew;
315
316
317
318
319
320							=head2 skew_unbiased
321
322							=for sig
323
324							Signature: (a(n); float+ [o]b())
325
326
327
328							=for ref
329
330							Unbiased estimate of population skewness. This is the number in GNumeric Descriptive Statistics.
331
332							=cut
333
334
335							=for bad
336
337							skew_unbiased processes bad values.
338							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
339
340
341							=cut
342
343
344
345
346
347
348							*skew_unbiased = \&PDLA::skew_unbiased;
349
350
351
352
353
354							=head2 kurt
355
356							=for sig
357
358							Signature: (a(n); float+ [o]b())
359
360
361
362							=for ref
363
364							Sample kurtosis, measure of "peakedness" of data. kurtosis == 0 for normal distribution.
365
366							=cut
367
368
369							=for bad
370
371							kurt processes bad values.
372							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
373
374
375							=cut
376
377
378
379
380
381
382							*kurt = \&PDLA::kurt;
383
384
385
386
387
388							=head2 kurt_unbiased
389
390							=for sig
391
392							Signature: (a(n); float+ [o]b())
393
394
395
396							=for ref
397
398							Unbiased estimate of population kurtosis. This is the number in GNumeric Descriptive Statistics.
399
400							=cut
401
402
403							=for bad
404
405							kurt_unbiased processes bad values.
406							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
407
408
409							=cut
410
411
412
413
414
415
416							*kurt_unbiased = \&PDLA::kurt_unbiased;
417
418
419
420
421
422							=head2 cov
423
424							=for sig
425
426							Signature: (a(n); b(n); float+ [o]c())
427
428
429
430							=for ref
431
432							Sample covariance. see B for ways to call
433
434							=cut
435
436
437							=for bad
438
439							cov processes bad values.
440							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
441
442
443							=cut
444
445
446
447
448
449
450							*cov = \&PDLA::cov;
451
452
453
454
455
456							=head2 cov_table
457
458							=for sig
459
460							Signature: (a(n,m); float+ [o]c(m,m))
461
462
463
464							=for ref
465
466							Square covariance table. Gives the same result as threading using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls.
467
468							=for usage
469
470							Usage:
471
472							# 5 obs x 3 var, 2 such data tables
473
474							perldl> $a = random 5, 3, 2
475
476							perldl> p $cov = $a->cov_table
477							[
478							[
479							[ 8.9636438 -1.8624472 -1.2416588]
480							[-1.8624472 14.341514 -1.4245366]
481							[-1.2416588 -1.4245366 9.8690655]
482							]
483							[
484							[ 10.32644 -0.31311789 -0.95643674]
485							[-0.31311789 15.051779 -7.2759577]
486							[-0.95643674 -7.2759577 5.4465141]
487							]
488							]
489							# diagonal elements of the cov table are the variances
490							perldl> p $a->var
491							[
492							[ 8.9636438 14.341514 9.8690655]
493							[ 10.32644 15.051779 5.4465141]
494							]
495
496							for the same cov matrix table using B,
497
498							perldl> p $a->dummy(2)->cov($a->dummy(1))
499
500
501
502
503							=for bad
504
505							cov_table processes bad values.
506							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
507
508
509							=cut
510
511
512
513
514
515
516							*cov_table = \&PDLA::cov_table;
517
518
519
520
521
522							=head2 corr
523
524							=for sig
525
526							Signature: (a(n); b(n); float+ [o]c())
527
528
529
530							=for ref
531
532							Pearson correlation coefficient. r = cov(X,Y) / (stdv(X) * stdv(Y)).
533
534							=for usage
535
536							Usage:
537
538							perldl> $a = random 5, 3
539							perldl> $b = sequence 5,3
540							perldl> p $a->corr($b)
541
542							[0.20934208 0.30949881 0.26713007]
543
544							for square corr table
545
546							perldl> p $a->corr($a->dummy(1))
547
548							[
549							[ 1 -0.41995259 -0.029301192]
550							[ -0.41995259 1 -0.61927619]
551							[-0.029301192 -0.61927619 1]
552							]
553
554							but it is easier and faster to use B.
555
556							=cut
557
558
559							=for bad
560
561							corr processes bad values.
562							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
563
564
565							=cut
566
567
568
569
570
571
572							*corr = \&PDLA::corr;
573
574
575
576
577
578							=head2 corr_table
579
580							=for sig
581
582							Signature: (a(n,m); float+ [o]c(m,m))
583
584
585
586							=for ref
587
588							Square Pearson correlation table. Gives the same result as threading using B but it calculates only half the square, hence much faster. And it is easier to use with higher dimension pdls.
589
590							=for usage
591
592							Usage:
593
594							# 5 obs x 3 var, 2 such data tables
595
596							perldl> $a = random 5, 3, 2
597
598							perldl> p $a->corr_table
599							[
600							[
601							[ 1 -0.69835951 -0.18549048]
602							[-0.69835951 1 0.72481605]
603							[-0.18549048 0.72481605 1]
604							]
605							[
606							[ 1 0.82722569 -0.71779883]
607							[ 0.82722569 1 -0.63938828]
608							[-0.71779883 -0.63938828 1]
609							]
610							]
611
612							for the same result using B,
613
614							perldl> p $a->dummy(2)->corr($a->dummy(1))
615
616							This is also how to use B and B with such a table.
617
618
619
620
621							=for bad
622
623							corr_table processes bad values.
624							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
625
626
627							=cut
628
629
630
631
632
633
634							*corr_table = \&PDLA::corr_table;
635
636
637
638
639
640							=head2 t_corr
641
642							=for sig
643
644							Signature: (r(); n(); [o]t())
645
646
647
648							=for usage
649
650							$corr = $data->corr( $data->dummy(1) );
651							$n = $data->n_pair( $data->dummy(1) );
652							$t_corr = $corr->t_corr( $n );
653
654							use PDLA::GSL::CDF;
655
656							$p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t_corr->abs, $n-2 ));
657
658							=for ref
659
660							t significance test for Pearson correlations.
661
662							=cut
663
664
665							=for bad
666
667							t_corr processes bad values.
668							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
669
670
671							=cut
672
673
674
675
676
677
678							*t_corr = \&PDLA::t_corr;
679
680
681
682
683
684							=head2 n_pair
685
686							=for sig
687
688							Signature: (a(n); b(n); indx [o]c())
689
690
691
692							=for ref
693
694							Returns the number of good pairs between 2 lists. Useful with B (esp. when bad values are involved)
695
696							=cut
697
698
699							=for bad
700
701							n_pair processes bad values.
702							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
703
704
705							=cut
706
707
708
709
710
711
712							*n_pair = \&PDLA::n_pair;
713
714
715
716
717
718							=head2 corr_dev
719
720							=for sig
721
722							Signature: (a(n); b(n); float+ [o]c())
723
724
725
726							=for usage
727
728							$corr = $a->dev_m->corr_dev($b->dev_m);
729
730							=for ref
731
732							Calculates correlations from B vals. Seems faster than doing B from original vals when data pdl is big
733
734							=cut
735
736
737							=for bad
738
739							corr_dev processes bad values.
740							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
741
742
743							=cut
744
745
746
747
748
749
750							*corr_dev = \&PDLA::corr_dev;
751
752
753
754
755
756							=head2 t_test
757
758							=for sig
759
760							Signature: (a(n); b(m); float+ [o]t(); [o]d())
761
762
763
764							=for usage
765
766							my ($t, $df) = t_test( $pdl1, $pdl2 );
767
768							use PDLA::GSL::CDF;
769
770							my $p_2tail = 2 * (1 - gsl_cdf_tdist_P( $t->abs, $df ));
771
772							=for ref
773
774							Independent sample t-test, assuming equal var.
775
776							=cut
777
778
779							=for bad
780
781							t_test processes bad values.
782							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
783
784
785							=cut
786
787
788
789
790
791
792							*t_test = \&PDLA::t_test;
793
794
795
796
797
798							=head2 t_test_nev
799
800							=for sig
801
802							Signature: (a(n); b(m); float+ [o]t(); [o]d())
803
804
805
806							=for ref
807
808							Independent sample t-test, NOT assuming equal var. ie Welch two sample t test. Df follows Welch-Satterthwaite equation instead of Satterthwaite (1946, as cited by Hays, 1994, 5th ed.). It matches GNumeric, which matches R.
809
810							=for usage
811
812							my ($t, $df) = $pdl1->t_test( $pdl2 );
813
814							=cut
815
816
817							=for bad
818
819							t_test_nev processes bad values.
820							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
821
822
823							=cut
824
825
826
827
828
829
830							*t_test_nev = \&PDLA::t_test_nev;
831
832
833
834
835
836							=head2 t_test_paired
837
838							=for sig
839
840							Signature: (a(n); b(n); float+ [o]t(); [o]d())
841
842
843
844							=for ref
845
846							Paired sample t-test.
847
848							=cut
849
850
851							=for bad
852
853							t_test_paired processes bad values.
854							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
855
856
857							=cut
858
859
860
861
862
863
864							*t_test_paired = \&PDLA::t_test_paired;
865
866
867
868
869							=head2 binomial_test
870
871							=for Sig
872
873							Signature: (x(); n(); p_expected(); [o]p())
874
875							=for ref
876
877							Binomial test. One-tailed significance test for two-outcome distribution. Given the number of successes, the number of trials, and the expected probability of success, returns the probability of getting this many or more successes.
878
879							This function does NOT currently support bad value in the number of successes.
880
881							=for usage
882
883							Usage:
884
885							# assume a fair coin, ie. 0.5 probablity of getting heads
886							# test whether getting 8 heads out of 10 coin flips is unusual
887
888							my $p = binomial_test( 8, 10, 0.5 ); # 0.0107421875. Yes it is unusual.
889
890							=cut
891
892							*binomial_test = \&PDLA::binomial_test;
893							sub PDLA::binomial_test {
894	0			0	0	0	my ($x, $n, $P) = @_;
895
896	0	0				0	carp 'Please install PDLA::GSL::CDF.' unless $CDF;
897	0	0				0	carp 'This function does NOT currently support bad value in the number of successes.' if $x->badflag();
898
899	0					0	my $pdlx = pdl($x);
900	0					0	$pdlx->badflag(1);
901	0					0	$pdlx = $pdlx->setvaltobad(0);
902
903	0					0	my $p = 1 - PDLA::GSL::CDF::gsl_cdf_binomial_P( $pdlx - 1, $P, $n );
904	0					0	$p = $p->setbadtoval(1);
905	0					0	$p->badflag(0);
906
907	0					0	return $p;
908							}
909
910
911							=head1 METHODS
912
913							=head2 rtable
914
915							=for ref
916
917							Reads either file or file handle. Returns observation x variable pdl and var and obs ids if specified. Ids in perl @ ref to allow for non-numeric ids. Other non-numeric entries are treated as missing, which are filled with $opt{MISSN} then set to BAD. Can specify num of data rows to read from top but not arbitrary range.
918
919							*If passed handle, it will not be closed here.
920
921							*PDLA::Bad::setvaltobad only works consistently with the default TYPE double before PDLA-2.4.4_04.
922
923							=for options
924
925							Default options (case insensitive):
926
927							V => 1, # verbose. prints simple status
928							TYPE => double,
929							C_ID => 1, # boolean. file has col id.
930							R_ID => 1, # boolean. file has row id.
931							R_VAR => 0, # boolean. set to 1 if var in rows
932							SEP => "\t", # can take regex qr//
933							MISSN => -999, # this value treated as missing and set to BAD
934							NROW => '', # set to read specified num of data rows
935
936							=for usage
937
938							Usage:
939
940							Sample file diet.txt:
941
942							uid height weight diet
943							akw 72 320 1
944							bcm 68 268 1
945							clq 67 180 2
946							dwm 70 200 2
947
948							($data, $idv, $ido) = rtable 'diet.txt';
949
950							# By default prints out data info and @$idv index and element
951
952							reading diet.txt for data and id... OK.
953							data table as PDLA dim o x v: PDLA: Double D [4,3]
954							0 height
955							1 weight
956							2 diet
957
958							Another way of using it,
959
960							$data = rtable( \*STDIN, {TYPE=>long} );
961
962							=cut
963
964							sub rtable {
965							# returns obs x var data matrix and var and obs ids
966	3			3	1	28082	my ($src, $opt) = @_;
967
968	3					6	my $fh_in;
969	3	50	33			31	if ($src =~ /STDIN/ or ref $src eq 'GLOB') { $fh_in = $src }
	3					9
970	0	0				0	else { open $fh_in, $src or croak "$!" }
971
972	3					16	my %opt = ( V => 1,
973							TYPE => double,
974							C_ID => 1,
975							R_ID => 1,
976							R_VAR => 0,
977							SEP => "\t",
978							MISSN => -999,
979							NROW => '',
980							);
981	3		33			62	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
982	3	50				11	$opt{V} and print STDERR "reading $src for data and id... ";
983
984	3					8	local $PDLA::undefval = $opt{MISSN};
985
986	3					7	my $id_c = []; # match declaration of $id_r for return purpose
987	3	50				7	if ($opt{C_ID}) {
988	3					17	chomp( $id_c = <$fh_in> );
989	3					52	my @entries = split $opt{SEP}, $id_c;
990	3	50				12	$opt{R_ID} and shift @entries;
991	3					9	$id_c = \@entries;
992							}
993
994	3					20	my ($c_row, $id_r, $data, @data) = (0, [], PDLA->null, );
995	3					70	while (<$fh_in>) {
996	132					170	chomp;
997	132					354	my @entries = split /$opt{SEP}/, $_, -1;
998
999	132	50				287	$opt{R_ID} and push @$id_r, shift @entries;
1000
1001							# rudimentary check for numeric entry
1002	132	100	66			172	for (@entries) { $_ = $opt{MISSN} unless defined $_ and /\d\b/ }
	669					1858
1003
1004	132					259	push @data, pdl( $opt{TYPE}, \@entries );
1005	132					2509	$c_row ++;
1006							last
1007	132	50	33			483	if $opt{NROW} and $c_row == $opt{NROW};
1008							}
1009							# not explicitly closing $fh_in here in case it's passed from outside
1010							# $fh_in will close by going out of scope if opened here.
1011
1012	3					27	$data = pdl $opt{TYPE}, @data;
1013	3					205	@data = ();
1014							# rid of last col unless there is data there
1015							$data = $data(0:$data->getdim(0)-2, )->sever
1016	3	100				23	unless ( nelem $data(-1, )->where($data(-1, ) != $opt{MISSN}) );
1017
1018	3					614	my ($idv, $ido) = ($id_r, $id_c);
1019							# var in columns instead of rows
1020	3	50				23	$opt{R_VAR} == 0
1021							and ($data, $idv, $ido) = ($data->inplace->transpose, $id_c, $id_r);
1022
1023	3	50				109	if ($opt{V}) {
1024	0					0	print STDERR "OK.\ndata table as PDLA dim o x v: " . $data->info . "\n";
1025	0		0			0	$idv and print STDERR "$_\t$$idv[$_]\n" for (0..$#$idv);
1026							}
1027
1028	3					65	$data = $data->setvaltobad( $opt{MISSN} );
1029	3					20	$data->check_badflag;
1030	3	50				200	return wantarray? (@$idv? ($data, $idv, $ido) : ($data, $ido)) : $data;
		50
1031							}
1032
1033							=head2 group_by
1034
1035							Returns pdl reshaped according to the specified factor variable. Most useful when used in conjunction with other threading calculations such as average, stdv, etc. When the factor variable contains unequal number of cases in each level, the returned pdl is padded with bad values to fit the level with the most number of cases. This allows the subsequent calculation (average, stdv, etc) to return the correct results for each level.
1036
1037							Usage:
1038
1039							# simple case with 1d pdl and equal number of n in each level of the factor
1040
1041							pdl> p $a = sequence 10
1042							[0 1 2 3 4 5 6 7 8 9]
1043
1044							pdl> p $factor = $a > 4
1045							[0 0 0 0 0 1 1 1 1 1]
1046
1047							pdl> p $a->group_by( $factor )->average
1048							[2 7]
1049
1050							# more complex case with threading and unequal number of n across levels in the factor
1051
1052							pdl> p $a = sequence 10,2
1053							[
1054							[ 0 1 2 3 4 5 6 7 8 9]
1055							[10 11 12 13 14 15 16 17 18 19]
1056							]
1057
1058							pdl> p $factor = qsort $a( ,0) % 3
1059							[
1060							[0 0 0 0 1 1 1 2 2 2]
1061							]
1062
1063							pdl> p $a->group_by( $factor )
1064							[
1065							[
1066							[ 0 1 2 3]
1067							[10 11 12 13]
1068							]
1069							[
1070							[ 4 5 6 BAD]
1071							[ 14 15 16 BAD]
1072							]
1073							[
1074							[ 7 8 9 BAD]
1075							[ 17 18 19 BAD]
1076							]
1077							]
1078							ARRAY(0xa2a4e40)
1079
1080							# group_by supports perl factors, multiple factors
1081							# returns factor labels in addition to pdl in array context
1082
1083							pdl> p $a = sequence 12
1084							[0 1 2 3 4 5 6 7 8 9 10 11]
1085
1086							pdl> $odd_even = [qw( e o e o e o e o e o e o )]
1087
1088							pdl> $magnitude = [qw( l l l l l l h h h h h h )]
1089
1090							pdl> ($a_grouped, $label) = $a->group_by( $odd_even, $magnitude )
1091
1092							pdl> p $a_grouped
1093							[
1094							[
1095							[0 2 4]
1096							[1 3 5]
1097							]
1098							[
1099							[ 6 8 10]
1100							[ 7 9 11]
1101							]
1102							]
1103
1104							pdl> p Dumper $label
1105							$VAR1 = [
1106							[
1107							'e_l',
1108							'o_l'
1109							],
1110							[
1111							'e_h',
1112							'o_h'
1113							]
1114							];
1115
1116
1117							=cut
1118
1119							*group_by = \&PDLA::group_by;
1120							sub PDLA::group_by {
1121	3			3	0	7188	my $p = shift;
1122	3					7	my @factors = @_;
1123
1124	3	100				10	if ( @factors == 1 ) {
1125	2					3	my $factor = $factors[0];
1126	2					2	my $label;
1127	2	50				6	if (ref $factor eq 'ARRAY') {
1128	0					0	$label = _ordered_uniq($factor);
1129	0					0	$factor = _array_to_pdl($factor);
1130							} else {
1131	2					7	my $perl_factor = [$factor->list];
1132	2					41	$label = _ordered_uniq($perl_factor);
1133							}
1134
1135	2					6	my $p_reshaped = _group_by_single_factor( $p, $factor );
1136
1137	2	100				25	return wantarray? ($p_reshaped, $label) : $p_reshaped;
1138							}
1139
1140							# make sure all are arrays instead of pdls
1141	1	50				2	@factors = map { ref($_) eq 'PDLA'? [$_->list] : $_ } @factors;
	2					8
1142
1143	1					2	my (@cells);
1144	1					2	for my $ele (0 .. $#{$factors[0]}) {
	1					3
1145	10					12	my $c = join '_', map { $_->[$ele] } @factors;
	20					30
1146	10					17	push @cells, $c;
1147							}
1148							# get uniq cell labels (ref List::MoreUtils::uniq)
1149	1					2	my %seen;
1150	1					2	my @uniq_cells = grep {! $seen{$_}++ } @cells;
	10					19
1151
1152	1					3	my $flat_factor = _array_to_pdl( \@cells );
1153
1154	1					2	my $p_reshaped = _group_by_single_factor( $p, $flat_factor );
1155
1156							# get levels of each factor and reshape accordingly
1157	1					2	my @levels;
1158	1					2	for (@factors) {
1159	2					4	my %uniq;
1160	2					7	@uniq{ @$_ } = ();
1161	2					6	push @levels, scalar keys %uniq;
1162							}
1163
1164	1					5	$p_reshaped = $p_reshaped->reshape( $p_reshaped->dim(0), @levels )->sever;
1165
1166							# make labels for the returned data structure matching pdl structure
1167	1					40	my @labels;
1168	1	50				4	if (wantarray) {
1169	1					3	for my $ifactor (0 .. $#levels) {
1170	2					3	my @factor_label;
1171	2					3	for my $ilevel (0 .. $levels[$ifactor]-1) {
1172	4					6	my $i = $ifactor * $levels[$ifactor] + $ilevel;
1173	4					7	push @factor_label, $uniq_cells[$i];
1174							}
1175	2					4	push @labels, \@factor_label;
1176							}
1177							}
1178
1179	1	50				8	return wantarray? ($p_reshaped, \@labels) : $p_reshaped;
1180							}
1181
1182							# get uniq cell labels (ref List::MoreUtils::uniq)
1183							sub _ordered_uniq {
1184	2			2		4	my $arr = shift;
1185
1186	2					3	my %seen;
1187	2					4	my @uniq = grep { ! $seen{$_}++ } @$arr;
	20					52
1188
1189	2					7	return \@uniq;
1190							}
1191
1192							sub _group_by_single_factor {
1193	3			3		4	my $p = shift;
1194	3					4	my $factor = shift;
1195
1196	3					8	$factor = $factor->squeeze;
1197	3	50				115	die "Currently support only 1d factor pdl."
1198							if $factor->ndims > 1;
1199
1200	3	50				9	die "Data pdl and factor pdl do not match!"
1201							unless $factor->dim(0) == $p->dim(0);
1202
1203							# get active dim that will be split according to factor and dims to thread over
1204	3					6	my @p_threaddims = $p->dims;
1205	3					50	my $p_dim0 = shift @p_threaddims;
1206
1207	3					8	my $uniq = $factor->uniq;
1208
1209	3					631	my @uniq_ns;
1210	3					8	for ($uniq->list) {
1211	9					272	push @uniq_ns, which( $factor == $_ )->nelem;
1212							}
1213
1214							# get number of n's in each group, find the biggest, fit output pdl to this
1215	3					79	my $uniq_ns = pdl \@uniq_ns;
1216	3					56	my $max = pdl(\@uniq_ns)->max;
1217
1218	3					149	my $badvalue = int($p->max + 1);
1219	3					99	my $p_tmp = ones($max, @p_threaddims, $uniq->nelem) * $badvalue;
1220	3					202	for (0 .. $#uniq_ns) {
1221	9					1164	my $i = which $factor == $uniq($_);
1222	9					398	$p_tmp->dice_axis(-1,$_)->squeeze->(0:$uniq_ns[$_]-1, ) .= $p($i, );
1223							}
1224
1225	3					477	$p_tmp->badflag(1);
1226	3					33	return $p_tmp->setvaltobad($badvalue);
1227							}
1228
1229							=head2 which_id
1230
1231							=for ref
1232
1233							Lookup specified var (obs) ids in $idv ($ido) (see B) and return indices in $idv ($ido) as pdl if found. The indices are ordered by the specified subset. Useful for selecting data by var (obs) id.
1234
1235							=for usage
1236
1237							my $ind = which_id $ido, ['smith', 'summers', 'tesla'];
1238
1239							my $data_subset = $data( $ind, );
1240
1241							# take advantage of perl pattern matching
1242							# e.g. use data from people whose last name starts with s
1243
1244							my $i = which_id $ido, [ grep { /^s/ } @$ido ];
1245
1246							my $data_s = $data($i, );
1247
1248							=cut
1249
1250							sub which_id {
1251	38			38	1	70	my ($id, $id_s) = @_;
1252
1253	38					44	my %ind;
1254	38					130	@ind{ @$id } = ( 0 .. $#$id );
1255
1256	38					54	my @ind_select;
1257	38					68	for (@$id_s) {
1258	65	50				164	defined( $ind{$_} ) and push @ind_select, $ind{$_};
1259							}
1260	38					75	return pdl @ind_select;
1261							}
1262
1263							sub _array_to_pdl {
1264	133			133		3142	my ($var_ref) = @_;
1265
1266	133					175	my (%level, $l);
1267	133					169	$l = 0;
1268	133					234	for (@$var_ref) {
1269	4684	100	66			15006	if (defined($_) and $_ ne '' and $_ ne 'BAD') {
			100
1270							$level{$_} = $l ++
1271	4676	100				8183	if !exists $level{$_};
1272							}
1273							}
1274
1275	133	100	100			239	my $pdl = pdl( map { (defined($_) and $_ ne '' and $_ ne 'BAD')? $level{$_} : -1 } @$var_ref );
	4684					15824
1276	133					4366	$pdl = $pdl->setvaltobad(-1);
1277	133					456	$pdl->check_badflag;
1278
1279	133	100				6134	return wantarray? ($pdl, \%level) : $pdl;
1280							}
1281
1282
1283							=head1 SEE ALSO
1284
1285							PDLA::Basic (hist for frequency counts)
1286
1287							PDLA::Ufunc (sum, avg, median, min, max, etc.)
1288
1289							PDLA::GSL::CDF (various cumulative distribution functions)
1290
1291							=head1 REFERENCES
1292
1293							Hays, W.L. (1994). Statistics (5th ed.). Fort Worth, TX: Harcourt Brace College Publishers.
1294
1295							=head1 AUTHOR
1296
1297							Copyright (C) 2009 Maggie J. Xiong
1298
1299							All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDLA distribution.
1300
1301							=cut
1302
1303
1304
1305							;
1306
1307
1308
1309							# Exit with OK status
1310
1311							1;
1312
1313