File Coverage

blib/lib/PDL/Stats/GLM.pm

Criterion	Covered	Total	%
statement	9	9	100.0
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	12	12	100.0

line	stmt	sub	time	code
1				#
2				# GENERATED WITH PDL::PP from lib/PDL/Stats/GLM.pd! Don't modify!
3				#
4				package PDL::Stats::GLM;
5
6				our @EXPORT_OK = qw(ols_t ols ols_rptd anova anova_rptd anova_design_matrix dummy_code effect_code effect_code_w interaction_code r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes fill_m fill_rand dev_m stddz sse mse rmse pred_logistic d0 dm dvrs );
7				our %EXPORT_TAGS = (Func=>\@EXPORT_OK);
8
9	1	1	1545	use PDL::Core;
	1		2
	1		7
10	1	1	432	use PDL::Exporter;
	1		2
	1		20
11	1	1	31	use DynaLoader;
	1		2
	1		863
12
13
14
15				our @ISA = ( 'PDL::Exporter','DynaLoader' );
16				push @PDL::Core::PP, __PACKAGE__;
17				bootstrap PDL::Stats::GLM ;
18
19
20
21
22
23
24
25
26				#line 14 "lib/PDL/Stats/GLM.pd"
27
28				use strict;
29				use warnings;
30
31				use Carp;
32				use PDL::LiteF;
33				use PDL::MatrixOps;
34				use PDL::Stats::Basic;
35				use PDL::Stats::Kmeans;
36
37				eval { require PDL::Core; require PDL::GSL::CDF; };
38				my $CDF = 1 if !$@;
39
40				=encoding utf8
41
42				=head1 NAME
43
44				PDL::Stats::GLM -- general and generalized linear modelling methods such as ANOVA, linear regression, PCA, and logistic regression.
45
46				=head1 SYNOPSIS
47
48				use PDL::LiteF;
49				use PDL::Stats::GLM;
50
51				# do a multiple linear regression and plot the residuals
52				my $y = pdl( 8, 7, 7, 0, 2, 5, 0 );
53				my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component
54				[ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component
55				my %m = $y->ols( $x, {plot=>1} );
56				print "$_\t$m{$_}\n" for sort keys %m;
57
58				=head1 DESCRIPTION
59
60				For more about general linear modelling, see
61				L.
62				For an unbelievably thorough text on experimental design
63				and analysis, including linear modelling, see L
64				Course in Design and Analysis of Experiments, Gary
65				W. Oehlert\|http://users.stat.umn.edu/~gary/book/fcdae.pdf>.
66
67				The terms FUNCTIONS and METHODS are arbitrarily used to refer to
68				methods that are broadcastable and methods that are NOT broadcastable,
69				respectively. FUNCTIONS support bad values.
70
71				P-values, where appropriate, are provided if PDL::GSL::CDF is installed.
72
73				=cut
74				#line 75 "lib/PDL/Stats/GLM.pm"
75
76
77				=head1 FUNCTIONS
78
79				=cut
80
81
82
83
84
85
86				=head2 fill_m
87
88				=for sig
89
90				Signature: (a(n); [o]b(n))
91				Types: (float double)
92
93				=for ref
94
95				Replaces bad values with sample mean. Mean is set to 0 if all obs are
96				bad.
97
98				=for usage
99
100				pdl> p $data
101				[
102				[ 5 BAD 2 BAD]
103				[ 7 3 7 BAD]
104				]
105
106				pdl> p $data->fill_m
107				[
108				[ 5 3.5 2 3.5]
109				[ 7 3 7 5.66667]
110				]
111
112
113				=pod
114
115				Broadcasts over its inputs.
116
117				=for bad
118
119				The output pdl badflag is cleared.
120
121				=cut
122
123
124
125
126				*fill_m = \&PDL::fill_m;
127
128
129
130
131
132
133				=head2 fill_rand
134
135				=for sig
136
137				Signature: (a(n); [o]b(n))
138				Types: (sbyte byte short ushort long ulong indx ulonglong longlong
139				float double ldouble)
140
141				=for ref
142
143				Replaces bad values with random sample (with replacement) of good
144				observations from the same variable.
145
146				=for usage
147
148				pdl> p $data
149				[
150				[ 5 BAD 2 BAD]
151				[ 7 3 7 BAD]
152				]
153
154				pdl> p $data->fill_rand
155
156				[
157				[5 2 2 5]
158				[7 3 7 7]
159				]
160
161				=pod
162
163				Broadcasts over its inputs.
164
165				=for bad
166
167				The output pdl badflag is cleared.
168
169				=cut
170
171
172
173
174				*fill_rand = \&PDL::fill_rand;
175
176
177
178
179
180
181				=head2 dev_m
182
183				=for sig
184
185				Signature: (a(n); [o]b(n))
186				Types: (float double)
187
188				=for usage
189
190				$b = dev_m($a);
191				dev_m($a, $b); # all arguments given
192				$b = $a->dev_m; # method call
193				$a->dev_m($b);
194				$a->inplace->dev_m; # can be used inplace
195				dev_m($a->inplace);
196
197				=for ref
198
199				Replaces values with deviations from the mean.
200
201				=pod
202
203				Broadcasts over its inputs.
204
205				=for bad
206
207				C processes bad values.
208				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
209
210				=cut
211
212
213
214
215				*dev_m = \&PDL::dev_m;
216
217
218
219
220
221
222				=head2 stddz
223
224				=for sig
225
226				Signature: (a(n); [o]b(n))
227				Types: (float double)
228
229				=for usage
230
231				$b = stddz($a);
232				stddz($a, $b); # all arguments given
233				$b = $a->stddz; # method call
234				$a->stddz($b);
235				$a->inplace->stddz; # can be used inplace
236				stddz($a->inplace);
237
238				=for ref
239
240				Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0).
241
242				=pod
243
244				Broadcasts over its inputs.
245
246				=for bad
247
248				C processes bad values.
249				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
250
251				=cut
252
253
254
255
256				*stddz = \&PDL::stddz;
257
258
259
260
261
262
263				=head2 sse
264
265				=for sig
266
267				Signature: (a(n); b(n); [o]c())
268				Types: (float double)
269
270				=for usage
271
272				$c = sse($a, $b);
273				sse($a, $b, $c); # all arguments given
274				$c = $a->sse($b); # method call
275				$a->sse($b, $c);
276
277				=for ref
278
279				Sum of squared errors between actual and predicted values.
280
281				=pod
282
283				Broadcasts over its inputs.
284
285				=for bad
286
287				C processes bad values.
288				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
289
290				=cut
291
292
293
294
295				*sse = \&PDL::sse;
296
297
298
299
300
301
302				=head2 mse
303
304				=for sig
305
306				Signature: (a(n); b(n); [o]c())
307				Types: (float double)
308
309				=for usage
310
311				$c = mse($a, $b);
312				mse($a, $b, $c); # all arguments given
313				$c = $a->mse($b); # method call
314				$a->mse($b, $c);
315
316				=for ref
317
318				Mean of squared errors between actual and predicted values, ie variance around predicted value.
319
320				=pod
321
322				Broadcasts over its inputs.
323
324				=for bad
325
326				C processes bad values.
327				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
328
329				=cut
330
331
332
333
334				*mse = \&PDL::mse;
335
336
337
338
339
340
341				=head2 rmse
342
343				=for sig
344
345				Signature: (a(n); b(n); [o]c())
346				Types: (float double)
347
348				=for usage
349
350				$c = rmse($a, $b);
351				rmse($a, $b, $c); # all arguments given
352				$c = $a->rmse($b); # method call
353				$a->rmse($b, $c);
354
355				=for ref
356
357				Root mean squared error, ie stdv around predicted value.
358
359				=pod
360
361				Broadcasts over its inputs.
362
363				=for bad
364
365				C processes bad values.
366				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
367
368				=cut
369
370
371
372
373				*rmse = \&PDL::rmse;
374
375
376
377
378
379
380				=head2 pred_logistic
381
382				=for sig
383
384				Signature: (a(n,m); b(m); [o]c(n))
385				Types: (float double)
386
387				=for ref
388
389				Calculates predicted prob value for logistic regression.
390
391				=for usage
392
393				# glue constant then apply coeff returned by the logistic method
394				$pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} );
395
396				=pod
397
398				Broadcasts over its inputs.
399
400				=for bad
401
402				C processes bad values.
403				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
404
405				=cut
406
407
408
409
410				*pred_logistic = \&PDL::pred_logistic;
411
412
413
414
415
416
417				=head2 d0
418
419				=for sig
420
421				Signature: (a(n); [o]c())
422				Types: (float double)
423
424				=for usage
425
426				$c = d0($a);
427				d0($a, $c); # all arguments given
428				$c = $a->d0; # method call
429				$a->d0($c);
430
431				=for ref
432
433				Null deviance for logistic regression.
434
435				=pod
436
437				Broadcasts over its inputs.
438
439				=for bad
440
441				C processes bad values.
442				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
443
444				=cut
445
446
447
448
449				*d0 = \&PDL::d0;
450
451
452
453
454
455
456				=head2 dm
457
458				=for sig
459
460				Signature: (a(n); b(n); [o]c())
461				Types: (float double)
462
463				=for ref
464
465				Model deviance for logistic regression.
466
467				=for usage
468
469				my $dm = $y->dm( $y_pred );
470				# null deviance
471				my $d0 = $y->dm( ones($y->nelem) * $y->avg );
472
473				=pod
474
475				Broadcasts over its inputs.
476
477				=for bad
478
479				C processes bad values.
480				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
481
482				=cut
483
484
485
486
487				*dm = \&PDL::dm;
488
489
490
491
492
493
494				=head2 dvrs
495
496				=for sig
497
498				Signature: (a(); b(); [o]c())
499				Types: (float double)
500
501				=for usage
502
503				$c = dvrs($a, $b);
504				dvrs($a, $b, $c); # all arguments given
505				$c = $a->dvrs($b); # method call
506				$a->dvrs($b, $c);
507
508				=for ref
509
510				Deviance residual for logistic regression.
511
512				=pod
513
514				Broadcasts over its inputs.
515
516				=for bad
517
518				C processes bad values.
519				It will set the bad-value flag of all output ndarrays if the flag is set for any of the input ndarrays.
520
521				=cut
522
523
524
525
526				*dvrs = \&PDL::dvrs;
527
528
529
530
531
532				#line 359 "lib/PDL/Stats/GLM.pd"
533
534				#line 360 "lib/PDL/Stats/GLM.pd"
535
536				=head2 ols_t
537
538				=for ref
539
540				Broadcasted version of ordinary least squares regression (B). The
541				price of broadcasting was losing significance tests for coefficients
542				(but see B). The fitting function was shamelessly copied then
543				modified from PDL::Fit::Linfit. Intercept is FIRST of coeff if CONST => 1.
544
545				ols_t does not handle bad values. consider B or B
546				if there are bad values.
547
548				=for options
549
550				Default options (case insensitive):
551
552				CONST => 1,
553
554				=for usage
555
556				Usage:
557
558				# DV, 2 person's ratings for top-10 box office movies
559				# ascending sorted by box office numbers
560
561				pdl> p $y = pdl '1 1 2 4 4 4 4 5 5 5; 1 2 2 2 3 3 3 3 5 5'
562
563				# model with 2 IVs, a linear and a quadratic trend component
564
565				pdl> $x = cat sequence(10), sequence(10)**2
566
567				# suppose our novice modeler thinks this creates 3 different models
568				# for predicting movie ratings
569
570				pdl> p $x = cat $x, $x * 2, $x * 3
571				[
572				[
573				[ 0 1 2 3 4 5 6 7 8 9]
574				[ 0 1 4 9 16 25 36 49 64 81]
575				]
576				[
577				[ 0 2 4 6 8 10 12 14 16 18]
578				[ 0 2 8 18 32 50 72 98 128 162]
579				]
580				[
581				[ 0 3 6 9 12 15 18 21 24 27]
582				[ 0 3 12 27 48 75 108 147 192 243]
583				]
584				]
585
586				pdl> p $x->info
587				PDL: Double D [10,2,3]
588
589				# insert a dummy dim between IV and the dim (model) to be broadcasted
590
591				pdl> %m = $y->ols_t( $x->dummy(2) )
592
593				pdl> p "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
594
595				# 2 persons' ratings, each fitted with 3 "different" models
596				F [
597				[ 38.314159 25.087209]
598				[ 38.314159 25.087209]
599				[ 38.314159 25.087209]
600				]
601
602				# df is the same across dv and iv models
603				F_df [2 7]
604				F_p [
605				[0.00016967051 0.00064215074]
606				[0.00016967051 0.00064215074]
607				[0.00016967051 0.00064215074]
608				]
609
610				R2 [
611				[ 0.9162963 0.87756762]
612				[ 0.9162963 0.87756762]
613				[ 0.9162963 0.87756762]
614				]
615
616				b [ # constant linear quadratic
617				[
618				[ 0.66363636 0.99015152 -0.056818182] # person 1
619				[ 1.4 0.18939394 0.022727273] # person 2
620				]
621				[
622				[ 0.66363636 0.49507576 -0.028409091]
623				[ 1.4 0.09469697 0.011363636]
624				]
625				[
626				[ 0.66363636 0.33005051 -0.018939394]
627				[ 1.4 0.063131313 0.0075757576]
628				]
629				]
630
631				# our novice modeler realizes at this point that
632				# the 3 models only differ in the scaling of the IV coefficients
633				ss_model [
634				[ 20.616667 13.075758]
635				[ 20.616667 13.075758]
636				[ 20.616667 13.075758]
637				]
638
639				ss_residual [
640				[ 1.8833333 1.8242424]
641				[ 1.8833333 1.8242424]
642				[ 1.8833333 1.8242424]
643				]
644
645				ss_total [22.5 14.9]
646				y_pred [
647				[
648				[0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273]
649				...
650
651				=cut
652
653				*ols_t = \&PDL::ols_t;
654				sub PDL::ols_t {
655				_ols_common(1, @_);
656				}
657
658				sub _ols_common {
659				my ($broadcasted, $y, $ivs, $opt) = @_;
660				($y, $ivs) = _ols_prep_inputs(@_);
661				_ols_main($broadcasted, $y, $ivs, $opt);
662				}
663
664				sub _ols_prep_inputs {
665				# y [n], ivs [n x attr] pdl
666				my ($broadcasted, $y, $ivs, $opt) = @_;
667				my %opt = (
668				CONST => 1,
669				PLOT => 0,
670				WIN => undef, # for plotting
671				);
672				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
673				if (!$broadcasted) {
674				$y = $y->squeeze;
675				$y->getndims > 1 and
676				croak "use ols_t for broadcasted version";
677				}
678				$ivs = $ivs->dummy(1) if $ivs->getndims == 1;
679				($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$broadcasted;
680				# set up ivs and const as ivs
681				$opt{CONST} and
682				$ivs = ones($ivs->dim(0))->glue( 1, $ivs );
683				($y, $ivs);
684				}
685
686				sub _ols_main {
687				my ($broadcasted, $y, $ivs, $opt) = @_;
688				my %opt = (
689				CONST => 1,
690				PLOT => 0,
691				WIN => undef, # for plotting
692				);
693				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
694				my $C = inv( $ivs x $ivs->t );
695				# Internally normalise data
696				# (double) it or ushort y and sequence iv won't work right
697				my $ymean = $y->abs->avgover->double;
698				$ymean->where( $ymean==0 ) .= 1;
699				my $divisor = $broadcasted ? $ymean->dummy(0) : $ymean;
700				my $y2 = $y / $divisor;
701				my $Y = $ivs x $y2->dummy(0);
702				# Do the fit
703				# Fitted coefficients vector
704				my $coeff = PDL::squeeze( $C x $Y );
705				$coeff = $coeff->dummy(0)
706				if $broadcasted and $coeff->getndims == 1 and $y->getndims > 1;
707				$coeff *= $divisor; # Un-normalise
708				# *$coeff x $ivs looks nice but produces nan on successive tries*
709				my $y_pred = sumover( ($broadcasted ? $coeff->dummy(1) : $coeff) * $ivs->transpose );
710				$opt{PLOT} and $y->plot_residuals( $y_pred, \%opt );
711				return $coeff unless wantarray;
712				my %ret = (y_pred => $y_pred);
713				$ret{ss_total} = $opt{CONST} ? $y->ss : sumover( $y ** 2 );
714				$ret{ss_residual} = $y->sse( $ret{y_pred} );
715				$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
716				$ret{R2} = $ret{ss_model} / $ret{ss_total};
717				my $n_var = $opt{CONST} ? $ivs->dim(1) - 1 : $ivs->dim(1);
718				$ret{F_df} = pdl( $n_var, my $df1 = $y->dim(0) - $ivs->dim(1) );
719				$ret{F} = $ret{ss_model} / $n_var
720				/ ($ret{ss_residual} / $df1);
721				$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $n_var, $df1 )
722				if $CDF;
723				if (!$broadcasted) {
724				my $se_b = ones( $coeff->dims? $coeff->dims : 1 );
725				$opt{CONST} and
726				$se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 * $C->slice(0,0) );
727				# get the se for bs by successively regressing each iv by the rest ivs
728				if ($ivs->dim(1) > 1) {
729				my @coords = $opt{CONST} ? 1..$n_var : 0..$n_var-1;
730				my $ones = !$opt{CONST} ? undef : ones($ivs->dim(0));
731				for my $k (@coords) {
732				my $G = $ivs->dice_axis(1, [grep $_ != $k, @coords]);
733				$G = $ones->glue( 1, $G ) if $opt{CONST};
734				my $b_G = $ivs->slice(':',$k)->ols( $G, {CONST=>0,PLOT=>0} );
735				my $ss_res_k = $ivs->slice(':',$k)->squeeze->sse( sumover($b_G * $G->transpose) );
736				$se_b->slice($k) .= sqrt( $ret{ss_residual} / $df1 / $ss_res_k );
737				}
738				}
739				else {
740				$se_b->slice(0)
741				.= sqrt( $ret{ss_residual} / $df1 / sum( $ivs->slice(':',0)**2 ) );
742				}
743				$ret{b_se} = $se_b;
744				$ret{b_t} = $coeff / $ret{b_se};
745				$ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $df1 ) )
746				if $CDF;
747				}
748				for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
749				$ret{b} = $coeff;
750				return %ret;
751				}
752
753				=head2 r2_change
754
755				=for ref
756
757				Significance test for the incremental change in R2 when new variable(s)
758				are added to an ols regression model.
759
760				Returns the change stats as well as stats for both models. Based on
761				L. (One way to make up for the lack of significance tests for
762				coeffs in ols_t).
763
764				=for options
765
766				Default options (case insensitive):
767
768				CONST => 1,
769
770				=for usage
771
772				Usage:
773
774				# suppose these are two persons' ratings for top 10 box office movies
775				# ascending sorted by box office
776
777				pdl> p $y = qsort ceil(random(10, 2) * 5)
778				[
779				[1 1 2 2 2 3 4 4 4 4]
780				[1 2 2 3 3 3 4 4 5 5]
781				]
782
783				# first IV is a simple linear trend
784
785				pdl> p $x1 = sequence 10
786				[0 1 2 3 4 5 6 7 8 9]
787
788				# the modeler wonders if adding a quadratic trend improves the fit
789
790				pdl> p $x2 = sequence(10) ** 2
791				[0 1 4 9 16 25 36 49 64 81]
792
793				# two difference models are given in two pdls
794				# each as would be pass on to ols_t
795				# the 1st model includes only linear trend
796				# the 2nd model includes linear and quadratic trends
797				# when necessary use dummy dim so both models have the same ndims
798
799				pdl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )
800
801				pdl> p "$_\t$c{$_}\n" for sort keys %c
802				# person 1 person 2
803				F_change [0.72164948 0.071283096]
804				# df same for both persons
805				F_df [1 7]
806				F_p [0.42370145 0.79717232]
807				R2_change [0.0085966043 0.00048562549]
808				model0 HASH(0x8c10828)
809				model1 HASH(0x8c135c8)
810
811				# the answer here is no.
812
813				=cut
814
815				*r2_change = \&PDL::r2_change;
816				sub PDL::r2_change {
817				my ($self, $ivs0, $ivs1, $opt) = @_;
818				$ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
819
820				my %ret;
821
822				$ret{model0} = { $self->ols_t( $ivs0, $opt ) };
823				$ret{model1} = { $self->ols_t( $ivs1, $opt ) };
824
825				$ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
826				$ret{F_df}
827				= pdl(my $df0 = $ivs1->dim(1) - $ivs0->dim(1),
828				my $df1 = $ret{model1}->{F_df}->slice('(1)') );
829				$ret{F_change}
830				= $ret{R2_change} * $df1
831				/ ( (1-$ret{model1}->{R2}) * $df0 );
832				$ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $df0, $df1 )
833				if $CDF;
834
835				for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
836
837				%ret;
838				}
839
840				=head1 METHODS
841
842				=head2 anova
843
844				=for ref
845
846				Analysis of variance. Uses type III sum of squares for unbalanced data.
847
848				Dependent variable should be a 1D pdl. Independent variables can be
849				passed as 1D perl array ref or 1D pdl.
850
851				Will only calculate p-value (C) if there are more samples than the
852				product of categories of all the IVs.
853
854				Supports bad value (by ignoring missing or BAD values in dependent and
855				independent variables list-wise).
856
857				For more on ANOVA, see L.
858
859				=for options
860
861				Default options (case insensitive):
862
863				V => 1, # carps if bad value in variables
864				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
865				PLOT => 0, # plots highest order effect
866				# can set plot_means options here
867				WIN => undef, # for plotting
868
869				=for usage
870
871				Usage:
872
873				# suppose this is ratings for 12 apples
874
875				pdl> p $y = qsort ceil( random(12)*5 )
876				[1 1 2 2 2 3 3 4 4 4 5 5]
877
878				# IV for types of apple
879
880				pdl> p $a = sequence(12) % 3 + 1
881				[1 2 3 1 2 3 1 2 3 1 2 3]
882
883				# IV for whether we baked the apple
884
885				pdl> @b = qw( y y y y y y n n n n n n )
886
887				pdl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } )
888
889				pdl> p "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
890				F 2.46666666666667
891				F_df [5 6]
892				F_p 0.151168719948632
893				ms_model 3.08333333333333
894				ms_residual 1.25
895				ss_model 15.4166666666667
896				ss_residual 7.5
897				ss_total 22.9166666666667
898				\| apple \| F 0.466666666666667
899				\| apple \| F_p 0.648078345471096
900				\| apple \| df 2
901				\| apple \| m [2.75 3 3.5]
902				\| apple \| ms 0.583333333333334
903				\| apple \| se [0.85391256 0.81649658 0.64549722]
904				\| apple \| ss 1.16666666666667
905				\| apple \|\| err df 6
906				\| apple ~ bake \| F 0.0666666666666671
907				\| apple ~ bake \| F_p 0.936190104380701
908				\| apple ~ bake \| df 2
909				\| apple ~ bake \| m [
910				[1.5 2 2.5]
911				[ 4 4 4.5]
912				]
913				\| apple ~ bake \| ms 0.0833333333333339
914				\| apple ~ bake \| se [
915				[0.5 1 0.5]
916				[ 1 1 0.5]
917				]
918				\| apple ~ bake \| ss 0.166666666666668
919				\| apple ~ bake \|\| err df 6
920				\| bake \| F 11.2666666666667
921				\| bake \| F_p 0.015294126084452
922				\| bake \| df 1
923				\| bake \| m [2 4.1666667]
924				\| bake \| ms 14.0833333333333
925				\| bake \| se [0.36514837 0.40138649]
926				\| bake \| ss 14.0833333333333
927				\| bake \|\| err df 6
928
929				This is implemented as a call to L, with an C
930				subjects vector.
931
932				=cut
933
934				*anova = \&PDL::anova;
935				sub PDL::anova {
936				my ($y, @args) = @_;
937				anova_rptd($y, undef, @args);
938				}
939
940				sub _interactions {
941				my ($ivs_ref, $idv) = @_;
942				my (@inter, @idv_inter);
943				for my $nway ( 2 .. @$ivs_ref ) {
944				my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
945				while ( my @v = &$iter_idv() ) {
946				push @inter, interaction_code(@$ivs_ref[@v]);
947				push @idv_inter, join ' ~ ', @$idv[@v];
948				}
949				}
950				(\@inter, \@idv_inter);
951				}
952
953				# now prepare for cell mean
954				sub _interactions_cm {
955				my ($ivs_ref, $pdl_ivs_raw) = @_;
956				my ($dim0, @inter_cm, @inter_cmo) = $ivs_ref->[0]->dim(0);
957				for my $nway ( 2 .. @$ivs_ref ) {
958				my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
959				while ( my @v = &$iter_idv() ) {
960				my @i_cm;
961				for my $o ( 0 .. $dim0 - 1 ) {
962				push @i_cm, join '', map $_->slice("($o)"), @$pdl_ivs_raw[@v];
963				}
964				my ($inter, $map) = effect_code( \@i_cm );
965				push @inter_cm, $inter;
966				# get the order to put means in correct multi dim pdl pos
967				# this is order in var_e dim(1)
968				my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map;
969				# this is order needed for cell mean
970				my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) }
971				0 .. $#levels;
972				push @inter_cmo, pdl @i_cmo;
973				}
974				}
975				(\@inter_cmo, \@inter_cm);
976				}
977
978				sub _cell_means {
979				my ($data, $ivs_cm_ref, $i_cmo_ref, $idv, $pdl_ivs_raw) = @_;
980				my %ind_id;
981				@ind_id{ @$idv } = 0..$#$idv;
982				my %cm;
983				my $i = 0;
984				for (@$ivs_cm_ref) {
985				confess "_cell_means passed empty ivs_cm_ref ndarray at pos $i" if $_->isempty;
986				my $last = zeroes $_->dim(0);
987				my $i_neg = which $_->slice(':',0) == -1;
988				$last->slice($i_neg) .= 1;
989				$_->where($_ == -1) .= 0;
990				$_ = $_->glue(1, $last);
991				my @v = split ' ~ ', $idv->[$i];
992				my @shape = map $pdl_ivs_raw->[$_]->uniq->nelem, @ind_id{@v};
993				my ($m, $ss) = $data->centroid( $_ );
994				$m = $m->slice($i_cmo_ref->[$i])->sever;
995				$ss = $ss->slice($i_cmo_ref->[$i])->sever;
996				$m = $m->reshape(@shape);
997				my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape);
998				$cm{ "\| $idv->[$i] \| m" } = $m;
999				$cm{ "\| $idv->[$i] \| se" } = $se;
1000				$i++;
1001				}
1002				\%cm;
1003				}
1004
1005				# http://www.perlmonks.org/?node_id=371228
1006				sub _combinations {
1007				my ($num, $arr) = @_;
1008				return sub { return }
1009				if $num == 0 or $num > @$arr;
1010				my @pick;
1011				return sub {
1012				return @$arr[ @pick = ( 0 .. $num - 1 ) ]
1013				unless @pick;
1014				my $i = $#pick;
1015				$i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i;
1016				return if $i < 0;
1017				@pick[$i .. $#pick] = $pick[$i] .. $#$arr;
1018				return @$arr[@pick];
1019				};
1020				}
1021
1022				=head2 anova_rptd
1023
1024				=for ref
1025
1026				Repeated measures and mixed model anova. Uses type III sum of squares.
1027
1028				The standard error (se) for the means are based on the relevant mean
1029				squared error from the anova, ie it is pooled across levels of the effect.
1030				Will only calculate p-value (C) if there are more samples than the
1031				product of categories of all the IVs.
1032
1033				Uses L, so supports bad values.
1034
1035				For more on repeated measures ANOVA, see
1036				L,
1037				and for mixed models, see
1038				L.
1039
1040				=for options
1041
1042				Default options (case insensitive):
1043
1044				V => 1, # carps if bad value in dv
1045				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1046				BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1047				PLOT => 0, # plots highest order effect
1048				# see plot_means() for more options
1049				WIN => undef, # for plotting
1050
1051				=for usage
1052
1053				Usage:
1054
1055				Some fictional data: recall_w_beer_and_wings.txt
1056
1057				Subject Beer Wings Recall
1058				Alex 1 1 8
1059				Alex 1 2 9
1060				Alex 1 3 12
1061				Alex 2 1 7
1062				Alex 2 2 9
1063				Alex 2 3 12
1064				Brian 1 1 12
1065				Brian 1 2 13
1066				Brian 1 3 14
1067				Brian 2 1 9
1068				Brian 2 2 8
1069				Brian 2 3 14
1070				...
1071
1072				# rtable allows text only in 1st row and col
1073				my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt';
1074
1075				my ($b, $w, $dv) = $data->dog;
1076				# subj and IVs can be 1d pdl or @ ref
1077				# subj must be the first argument
1078				my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} );
1079
1080				print "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
1081
1082				ss_residual 19.0833333333333
1083				ss_subject 24.8333333333333
1084				ss_total 133.833333333333
1085				\| Beer \| F 9.39130434782609
1086				\| Beer \| F_p 0.0547977008378944
1087				\| Beer \| df 1
1088				\| Beer \| m [10.916667 8.9166667]
1089				\| Beer \| ms 24
1090				\| Beer \| se [0.4614791 0.4614791]
1091				\| Beer \| ss 24
1092				\| Beer \|\| err df 3
1093				\| Beer \|\| err ms 2.55555555555556
1094				\| Beer \|\| err ss 7.66666666666667
1095				\| Beer ~ Wings \| F 0.510917030567687
1096				\| Beer ~ Wings \| F_p 0.623881438624431
1097				\| Beer ~ Wings \| df 2
1098				\| Beer ~ Wings \| m [
1099				[ 10 7]
1100				[ 10.5 9.25]
1101				[12.25 10.5]
1102				]
1103				\| Beer ~ Wings \| ms 1.625
1104				\| Beer ~ Wings \| se [
1105				[0.89170561 0.89170561]
1106				[0.89170561 0.89170561]
1107				[0.89170561 0.89170561]
1108				]
1109				\| Beer ~ Wings \| ss 3.25000000000001
1110				\| Beer ~ Wings \|\| err df 6
1111				\| Beer ~ Wings \|\| err ms 3.18055555555555
1112				\| Beer ~ Wings \|\| err ss 19.0833333333333
1113				\| Wings \| F 4.52851711026616
1114				\| Wings \| F_p 0.0632754786153548
1115				\| Wings \| df 2
1116				\| Wings \| m [8.5 9.875 11.375]
1117				\| Wings \| ms 16.5416666666667
1118				\| Wings \| se [0.67571978 0.67571978 0.67571978]
1119				\| Wings \| ss 33.0833333333333
1120				\| Wings \|\| err df 6
1121				\| Wings \|\| err ms 3.65277777777778
1122				\| Wings \|\| err ss 21.9166666666667
1123
1124				For mixed model anova, ie when there are between-subject IVs
1125				involved, feed the IVs as above, but specify in BTWN which IVs are
1126				between-subject. For example, if we had added age as a between-subject
1127				IV in the above example, we would do
1128
1129				my %m = $dv->anova_rptd( $subj, $age, $b, $w,
1130				{ ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] });
1131
1132				=cut
1133
1134				*anova_rptd = \&PDL::anova_rptd;
1135				sub PDL::anova_rptd {
1136				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
1137				my ($y, $subj, @ivs_raw) = @_;
1138				confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
1139				croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?"
1140				if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem);
1141				for (@ivs_raw) {
1142				croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1
1143				}
1144				my %opt = (
1145				V => 1, # carps if bad value in dv
1146				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1147				( !defined($subj) ? () : (
1148				BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1149				)),
1150				PLOT => 0, # plots highest order effect
1151				WIN => undef, # for plotting
1152				);
1153				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1154				$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
1155				if !$opt{IVNM} or !@{ $opt{IVNM} };
1156				(my ($dsgn, $idv), $y, my ($sj, $ivs_ref_filtrd, $pdl_ivs_raw, $ivs_ref, $err_ref)) = $y->anova_design_matrix($subj, @ivs_raw, \%opt);
1157				confess "anova_rptd: got singular matrix for X' x X"
1158				if any +($dsgn->t x $dsgn)->det == 0;
1159				my $b_full = _ols_main(1, $y, $dsgn->t, {CONST=>0});
1160
1161				my %ret = (ss_total => $y->ss,
1162				ss_residual => $y->sse( sumover( $b_full * $dsgn ) ));
1163
1164				if (defined $subj) {
1165				my @full = (@$ivs_ref, @$err_ref);
1166				my $has_btwn = @{ $opt{BTWN} };
1167				my @is_btwn; $is_btwn[$_] = 1 for @{ $opt{BTWN} };
1168				my @within_inds = 0 .. $#ivs_raw;
1169				@within_inds = grep !$is_btwn[$_], @within_inds if $has_btwn;
1170				my $within_df = pdl(map $_->dim(1), @full[@within_inds])->prodover->sclr;
1171				EFFECT: for my $k (0 .. $#full) {
1172				my $e = ($k > $#$ivs_ref)? '\| err' : '';
1173				my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k;
1174				my $i_pref = $k == $#full ? undef : "\| $idv->[$i] \|";
1175
1176				if (!defined $full[$k]) { # ss_residual as error
1177				$ret{ "$i_pref$e ss" } = $ret{ss_residual};
1178				# highest ord inter for purely within design, (p-1)(q-1)(n-1)
1179				my $factor = (ref $full[-1] ? $full[-1] : $err_ref->[$full[-1]])->dim(1);
1180				my $df = $ret{ "$i_pref$e df" } = $factor * $within_df;
1181				die "${i_pref}residual df = 0" if $df <= 0;
1182				$ret{ "$i_pref$e ms" } = $ret{ "$i_pref$e ss" } / $df;
1183				} elsif (ref $full[$k]) { # unique error term
1184				next EFFECT
1185				unless my @G = grep $_ != $k && defined $full[$_], 0 .. $#full;
1186				my $G = ones($y->dim(0))->glue(1, grep ref $_, @full[@G]);
1187				my $b_G = $y->ols_t( $G, {CONST=>0} );
1188				my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
1189				= $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
1190				if ($k != $#full) {
1191				my $df = $ret{"$i_pref$e df"} = $full[$k]->dim(1);
1192				die "residual df = 0" if $df <= 0;
1193				$ret{"$i_pref$e ms"} = $ss / $df;
1194				}
1195				} else { # repeating error term
1196				my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
1197				= $ret{"\| $idv->[$full[$k]] \|$e ss"};
1198				if ($k != $#full) {
1199				my $df = $ret{"$i_pref$e df"} = $ret{"\| $idv->[$full[$k]] \|$e df"};
1200				die "residual df = 0" if $df <= 0;
1201				$ret{"$i_pref$e ms"} = $ss / $df;
1202				}
1203				}
1204				}
1205				} else {
1206				$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
1207				$ret{F_df} = pdl(my $F_df0 = $dsgn->dim(0) - 1, my $df1 = $y->nelem - $dsgn->dim(0));
1208				$ret{ms_model} = $ret{ss_model} / $F_df0;
1209				$ret{ms_residual} = $ret{ss_residual} / $df1;
1210				$ret{F} = $ret{ms_model} / $ret{ms_residual};
1211				$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $F_df0, $df1 )
1212				if $CDF and $df1 > 0;
1213
1214				# get IV ss from $ivs_ref instead of $dsgn pdl
1215				my $ones = ones($y->dim(0));
1216				for my $k (0 .. $#$ivs_ref) {
1217				my $G = $ones->glue(1, @$ivs_ref[grep $_ != $k, 0 .. $#$ivs_ref]);
1218				my $b_G = $y->ols_t( $G, {CONST=>0} );
1219				$ret{ "\| $idv->[$k] \| ss" }
1220				= $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual};
1221				my $df0 = $ret{ "\| $idv->[$k] \| df" } = $ivs_ref->[$k]->dim(1);
1222				$ret{ "\| $idv->[$k] \|\| err df" } = $df1;
1223				die "residual df = 0" if $df1 <= 0;
1224				$ret{ "\| $idv->[$k] \| ms" } = $ret{ "\| $idv->[$k] \| ss" } / $df0;
1225				}
1226				}
1227				# have all iv, inter, and error effects. get F and F_p
1228				for (0 .. $#$ivs_ref) {
1229				my $ms_residual = defined $subj ? $ret{ "\| $idv->[$_] \|\| err ms" } : $ret{ms_residual};
1230				my ($df0, $df1) = @ret{"\| $idv->[$_] \| df" , "\| $idv->[$_] \|\| err df"};
1231				my $F = $ret{ "\| $idv->[$_] \| F" } = $ret{ "\| $idv->[$_] \| ms" } / $ms_residual;
1232				$ret{ "\| $idv->[$_] \| F_p" } = 1 - $F->gsl_cdf_fdist_P($df0, $df1)
1233				if $CDF and $df1 > 0;
1234				}
1235
1236				for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze};
1237
1238				my ($inter_cmo_ref, $inter_cm_ref)
1239				= _interactions_cm($ivs_ref_filtrd, $pdl_ivs_raw);
1240				# append inter info to cell means effects
1241				my $ivs_cm_ref = [@$ivs_ref_filtrd, @$inter_cm_ref];
1242				my @i_cmo_ref = map pdl(values %{ (effect_code($_))[1] })->qsort, @$pdl_ivs_raw;
1243				#line 1068 "lib/PDL/Stats/GLM.pd"
1244				push @i_cmo_ref, @$inter_cmo_ref;
1245
1246				my $cm_ref
1247				= _cell_means( $y, $ivs_cm_ref, \@i_cmo_ref, $idv, $pdl_ivs_raw );
1248				if (defined $subj) {
1249				my @ls = map { $_->uniq->nelem } @$pdl_ivs_raw;
1250				$cm_ref
1251				= _fix_rptd_se( $cm_ref, \%ret, $opt{IVNM}, \@ls, $sj->uniq->nelem );
1252				}
1253				# integrate mean and se into %ret
1254				@ret{ keys %$cm_ref } = values %$cm_ref;
1255
1256				my $highest = join(' ~ ', @{ $opt{IVNM} });
1257				$cm_ref->{"\| $highest \| m"}->plot_means( $cm_ref->{"\| $highest \| se"},
1258				{ %opt, IVNM=>$idv } )
1259				if $opt{PLOT};
1260
1261				%ret;
1262				}
1263
1264				=head2 anova_design_matrix
1265
1266				=for ref
1267
1268				Effect-coded design matrix for anova, including repeated-measures and
1269				mixed-model. The C for use in linear regression i.e. C.
1270
1271				Added in 0.854. See L for more.
1272
1273				Supports bad value in the dependent and independent variables. It
1274				automatically removes bad data listwise, i.e. remove a subject's data if
1275				there is any cell missing for the subject.
1276
1277				=for options
1278
1279				Default options (case insensitive):
1280
1281				V => 1, # carps if bad value in dv
1282				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1283				BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1284
1285				=for usage
1286
1287				$matrix = $dv->anova_design_matrix(undef, $b, $w, {ivnm=>[qw(b w)]});
1288				$matrix = $dv->anova_design_matrix(
1289				$subj, $b, $w, {ivnm=>[qw(b w)]}); # repeated-measures
1290				$matrix = $dv->anova_design_matrix(
1291				$subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # mixed-model
1292				($matrix, $ivnm_ref) = $dv->anova_design_matrix(
1293				$subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # list context also names
1294
1295				=cut
1296
1297				*anova_design_matrix = \&PDL::anova_design_matrix;
1298				sub PDL::anova_design_matrix {
1299				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
1300				my ($y, $subj, @ivs_raw) = @_;
1301				confess "No IVs: did you omit 'undef' for anova?" if !@ivs_raw;
1302				confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
1303				for (@ivs_raw) {
1304				croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1;
1305				}
1306				my %opt = (
1307				V => 1, # carps if bad value in dv
1308				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1309				( !defined($subj) ? () : (
1310				BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1311				)),
1312				);
1313				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1314				$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
1315				if !$opt{IVNM} or !@{ $opt{IVNM} };
1316				my @idv_orig = @{ $opt{IVNM} };
1317				my @pdl_ivs_raw = map scalar PDL::Stats::Basic::code_ivs($_), @ivs_raw;
1318				my $pdl_ivs = pdl(\@pdl_ivs_raw);
1319				# explicit set badflag because pdl() removes badflag
1320				$pdl_ivs->badflag( scalar grep $_->badflag, @pdl_ivs_raw );
1321				my $sj;
1322				if (defined($subj)) {
1323				# delete bad data listwise ie remove subj if any cell missing
1324				$sj = PDL::Stats::Basic::code_ivs($subj);
1325				my $ibad = which( $y->isbad \| nbadover($pdl_ivs->transpose) );
1326				my $sj_bad = $sj->slice($ibad)->uniq;
1327				if ($sj_bad->nelem) {
1328				warn $sj_bad->nelem . " subjects with missing data removed\n"
1329				if $opt{V};
1330				$sj = $sj->setvaltobad($_)
1331				for (list $sj_bad);
1332				my $igood = which $sj->isgood;
1333				for ($y, $sj, @pdl_ivs_raw) {
1334				$_ = $_->slice( $igood )->sever;
1335				$_->badflag(0);
1336				}
1337				}
1338				} else {
1339				($y, $pdl_ivs) = _rm_bad_value( $y, $pdl_ivs );
1340				if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) {
1341				warn sprintf "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem;
1342				}
1343				@pdl_ivs_raw = $pdl_ivs->dog;
1344				}
1345				my @ivs_ref_fltrd = map scalar effect_code($_), @pdl_ivs_raw;
1346				my ($ivs_inter_ref, $idv_inter) = _interactions(\@ivs_ref_fltrd, \@idv_orig);
1347				# append inter info to main effects
1348				my $ivs_ref = [@ivs_ref_fltrd, @$ivs_inter_ref];
1349				my @idv = (@idv_orig, @$idv_inter);
1350				# matches $ivs_ref, with an extra last pdl for subj effect
1351				my $err_ref = !defined($subj) ? [] :
1352				_add_errors( $sj, $ivs_ref, \@idv, \@pdl_ivs_raw, $opt{BTWN} );
1353				for (grep ref $err_ref->[$_], 0..$#$err_ref) {
1354				my ($null_row_ids, $non_null_row_ids) = $err_ref->[$_]->zcover->which_both;
1355				confess "got null columns $null_row_ids in error entry #$_ ($idv[$_])"
1356				if !$null_row_ids->isempty;
1357				}
1358				my $dsgn = PDL::glue(1, ones($y->dim(0)), @$ivs_ref, (grep ref($_), @$err_ref))->t;
1359				!wantarray ? $dsgn : ($dsgn, \@idv, $y, $sj, \@ivs_ref_fltrd, \@pdl_ivs_raw, $ivs_ref, $err_ref);
1360				}
1361
1362				# code (btwn group) subjects. Rutherford (2011) pp 208-209
1363				sub _code_btwn {
1364				my ($subj, $btwn) = @_;
1365				my (@grp, %grp_s);
1366				for my $n (0 .. $subj->nelem - 1) {
1367				# construct string to code group membership
1368				# something not treated as BAD by code_ivs to start off marking group membership
1369				# if no $btwn, everyone ends up in the same grp
1370				my $s = '_' . join '', map $_->slice($n), @$btwn;
1371				push @grp, $s; # group membership
1372				$s .= $subj->slice($n); # keep track of total uniq subj
1373				$grp_s{$s} = 1;
1374				}
1375				my $grp = PDL::Stats::Kmeans::iv_cluster \@grp;
1376				my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1);
1377				my $d1 = 0;
1378				for my $g (0 .. $grp->dim(1)-1) {
1379				my $col_inds = which $grp->slice(':',$g);
1380				my $gsub = $subj->slice( $col_inds )->effect_code;
1381				my ($nobs, $nsub) = $gsub->dims;
1382				$spdl->slice($col_inds, [$d1,$d1+$nsub-1]) .= $gsub;
1383				$d1 += $nsub;
1384				}
1385				$spdl;
1386				}
1387
1388				sub _add_errors {
1389				my ($subj, $ivs_ref, $idv, $raw_ivs, $btwn) = @_;
1390				my $spdl = _code_btwn($subj, [@$raw_ivs[@$btwn]]);
1391				# if btwn factor involved, or highest order inter for within factors
1392				# elem is undef, so that
1393				# @errors ind matches @$ivs_ref, with an extra elem at the end for subj
1394				# mark btwn factors for error terms
1395				# same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p205)
1396				my %is_btwn = map +($_=>1), @$idv[ @$btwn ];
1397				my $has_btwn = keys %is_btwn;
1398				my %idv2indx = map +($idv->[$_]=>$_), 0..$#$idv;
1399				my $ie_subj;
1400				my @errors = map {
1401				my @fs = split ' ~ ', $idv->[$_];
1402				# separate bw and wn factors
1403				# if only bw, error is bw x subj
1404				# if only wn or wn and bw, error is wn x subj
1405				my @bw = !$has_btwn ? () : grep $is_btwn{$_}, @fs;
1406				my @wn = !$has_btwn ? @fs : grep !$is_btwn{$_}, @fs;
1407				$ie_subj = $_ if !defined($ie_subj) and !@wn;
1408				my $err = join ' ~ ', @wn ? @wn : @bw;
1409				# highest order inter of within factors, use ss_residual as error
1410				if ( @wn == @$raw_ivs - @$btwn ) { undef }
1411				# repeating btwn factors use ss_subject as error
1412				elsif (!@wn and $_ > $ie_subj) { $ie_subj }
1413				# repeating error term
1414				elsif ($_ > $idv2indx{$err}) { $idv2indx{$err} }
1415				elsif (@wn) { interaction_code($ivs_ref->[$_], $spdl) }
1416				else { $spdl }
1417				} 0 .. $#$ivs_ref;
1418				push @errors, $has_btwn ? $ie_subj : $spdl;
1419				\@errors;
1420				}
1421
1422				sub _fix_rptd_se {
1423				# if ivnm lvls_ref for within ss only this can work for mixed design
1424				my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_;
1425				my @se = map /^\\| (.+?) \\| se$/ ? $1 : (), keys %$cm_ref;
1426				my @n_obs = map {
1427				my @ivs = split / ~ /, $_;
1428				my $i_ivs = which_id $ivnm, \@ivs;
1429				my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs;
1430				my $collapsed = $icollapsed->nelem?
1431				pdl( @$lvls_ref[(list $icollapsed)] )->prodover
1432				: 1
1433				;
1434				$n * $collapsed;
1435				} @se;
1436				for my $i (0 .. $#se) {
1437				$cm_ref->{"\| $se[$i] \| se"}
1438				.= sqrt( $ret->{"\| $se[$i] \|\| err ms"} / $n_obs[$i] );
1439				}
1440				$cm_ref;
1441				}
1442
1443				=head2 dummy_code
1444
1445				=for ref
1446
1447				Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression.
1448
1449				Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1450
1451				=for usage
1452
1453				pdl> @a = qw(a a a b b b c c c)
1454				pdl> p $a = dummy_code(\@a)
1455				[
1456				[1 1 1 0 0 0 0 0 0]
1457				[0 0 0 1 1 1 0 0 0]
1458				]
1459
1460				=cut
1461
1462				*dummy_code = \&PDL::dummy_code;
1463				sub PDL::dummy_code {
1464				my ($var_ref) = @_;
1465				my $var_e = effect_code( $var_ref );
1466				$var_e->where( $var_e == -1 ) .= 0;
1467				$var_e;
1468				}
1469
1470				=head2 effect_code
1471
1472				=for ref
1473
1474				Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for
1475				use in regression. returns in @ context coded pdl and % ref to level -
1476				pdl->dim(1) index.
1477
1478				Supports BAD value (missing or 'BAD' values result in the corresponding
1479				pdl elements being marked as BAD).
1480
1481				=for usage
1482
1483				my @var = qw( a a a b b b c c c );
1484				my ($var_e, $map) = effect_code( \@var );
1485
1486				print $var_e . $var_e->info . "\n";
1487
1488				[
1489				[ 1 1 1 0 0 0 -1 -1 -1]
1490				[ 0 0 0 1 1 1 -1 -1 -1]
1491				]
1492				PDL: Double D [9,2]
1493
1494				print "$_\t$map->{$_}\n" for sort keys %$map
1495				a 0
1496				b 1
1497				c 2
1498
1499				=cut
1500
1501				*effect_code = \&PDL::effect_code;
1502				sub PDL::effect_code {
1503				my ($var_ref) = @_;
1504				my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
1505				my $var_max = $var->max;
1506				confess "effect_code called with only one unique value" if $var_max < 1;
1507				my $var_e = yvals( float, $var->nelem, $var_max ) == $var;
1508				$var_e->slice(which( $var == $var_max ), ) .= -1;
1509				$var_e = $var_e->setbadif( $var->isbad ) if $var->badflag;
1510				wantarray ? ($var_e, $map_ref) : $var_e;
1511				}
1512
1513				=head2 effect_code_w
1514
1515				=for ref
1516
1517				Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index.
1518
1519				Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1520
1521				=for usage
1522
1523				pdl> @a = qw( a a b b b c c )
1524				pdl> p $a = effect_code_w(\@a)
1525				[
1526				[ 1 1 0 0 0 -1 -1]
1527				[ 0 0 1 1 1 -1.5 -1.5]
1528				]
1529
1530				=cut
1531
1532				*effect_code_w = \&PDL::effect_code_w;
1533				sub PDL::effect_code_w {
1534				my ($var_ref) = @_;
1535				my ($var_e, $map_ref) = effect_code( $var_ref );
1536				return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0;
1537				my $pos = $var_e == 1;
1538				my $neg = $var_e == -1;
1539				my $w = $pos->sumover / $neg->sumover;
1540				my $neg_ind = $neg->whichND;
1541				$var_e->indexND($neg_ind) *= $w->slice($neg_ind->slice('(1)'));
1542				wantarray ? ($var_e, $map_ref) : $var_e;
1543				}
1544
1545				=head2 interaction_code
1546
1547				Returns the coded interaction term for effect-coded variables.
1548
1549				Supports BAD value (missing or 'BAD' values result in the corresponding
1550				pdl elements being marked as BAD).
1551
1552				=for usage
1553
1554				pdl> $a = sequence(6) > 2
1555				pdl> p $a = $a->effect_code
1556				[
1557				[ 1 1 1 -1 -1 -1]
1558				]
1559
1560				pdl> $b = pdl( qw( 0 1 2 0 1 2 ) )
1561				pdl> p $b = $b->effect_code
1562				[
1563				[ 1 0 -1 1 0 -1]
1564				[ 0 1 -1 0 1 -1]
1565				]
1566
1567				pdl> p $ab = interaction_code( $a, $b )
1568				[
1569				[ 1 0 -1 -1 -0 1]
1570				[ 0 1 -1 -0 -1 1]
1571				]
1572
1573				=cut
1574
1575				*interaction_code = \&PDL::interaction_code;
1576				sub PDL::interaction_code {
1577				my $i = ones( $_[0]->dim(0), 1 );
1578				$i = ($i * $_->dummy(1))->clump(1,2) for @_;
1579				$i;
1580				}
1581
1582				=head2 ols
1583
1584				=for ref
1585
1586				Ordinary least squares regression, aka linear regression.
1587
1588				Unlike B, ols is not broadcastable, but it can handle bad value (by
1589				ignoring observations with bad value in dependent or independent variables
1590				list-wise) and returns the full model in list context with various stats.
1591
1592				IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
1593				the constant vector in $x. Intercept is automatically added and returned
1594				as FIRST of the coeffs if CONST=>1. Returns full model in list context
1595				and coeff in scalar context.
1596
1597				For more on multiple linear regression see
1598				L.
1599
1600				=for options
1601
1602				Default options (case insensitive):
1603
1604				CONST => 1,
1605				PLOT => 0, # see plot_residuals() for plot options
1606				WIN => undef, # for plotting
1607
1608				=for usage
1609
1610				Usage:
1611
1612				# suppose this is a person's ratings for top 10 box office movies
1613				# ascending sorted by box office
1614
1615				pdl> $y = pdl '[1 1 2 2 2 2 4 4 5 5]'
1616
1617				# construct IV with linear and quadratic component
1618
1619				pdl> p $x = cat sequence(10), sequence(10)**2
1620				[
1621				[ 0 1 2 3 4 5 6 7 8 9]
1622				[ 0 1 4 9 16 25 36 49 64 81]
1623				]
1624
1625				pdl> %m = $y->ols( $x )
1626
1627				pdl> p "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
1628
1629				F 40.4225352112676
1630				F_df [2 7]
1631				F_p 0.000142834216344756
1632				R2 0.920314253647587
1633
1634				# coeff constant linear quadratic
1635				b [0.981818 0.212121 0.030303]
1636				b_p [0.039910 0.328001 0.203034]
1637				b_se [0.389875 0.201746 0.021579]
1638				b_t [2.518284 1.051422 1.404218]
1639				ss_model 19.8787878787879
1640				ss_residual 1.72121212121212
1641				ss_total 21.6
1642				y_pred [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]
1643
1644				=cut
1645
1646				*ols = \&PDL::ols;
1647				sub PDL::ols {
1648				_ols_common(0, @_);
1649				}
1650
1651				# ivs = [nobs x nivs] so can `dog` retval
1652				sub _rm_bad_value {
1653				my ($y, $ivs) = @_;
1654				return ($y, $ivs, undef) if !$y->check_badflag and !$ivs->check_badflag;
1655				my $idx = which($y->isgood & (nbadover ($ivs->transpose)==0));
1656				$_ = $_->slice($idx)->sever for $y, $ivs;
1657				$_->badflag(0) for $y, $ivs;
1658				($y, $ivs, $idx);
1659				}
1660
1661				=head2 ols_rptd
1662
1663				=for ref
1664
1665				Repeated measures linear regression.
1666				Handles purely within-subject design for now.
1667
1668				(Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006).
1669				See F for an example using the Lorch and Myers data.
1670
1671				=for usage
1672
1673				Usage:
1674
1675				# This is the example from Lorch and Myers (1990),
1676				# a study on how characteristics of sentences affected reading time
1677				# Three within-subject IVs:
1678				# SP -- serial position of sentence
1679				# WORDS -- number of words in sentence
1680				# NEW -- number of new arguments in sentence
1681
1682				# $subj can be 1D pdl or @ ref and must be the first argument
1683				# IV can be 1D @ ref or pdl
1684				# 1D @ ref is effect coded internally into pdl
1685				# pdl is left as is
1686
1687				my %r = $rt->ols_rptd( $subj, $sp, $words, $new );
1688
1689				print "$_\t$r{$_}\n" for sort keys %r;
1690
1691				ss_residual 58.3754646504336
1692				ss_subject 51.8590337714286
1693				ss_total 405.188241771429
1694				# SP WORDS NEW
1695				F [ 7.208473 61.354153 1.0243311]
1696				F_p [0.025006181 2.619081e-05 0.33792837]
1697				coeff [0.33337285 0.45858933 0.15162986]
1698				df [1 1 1]
1699				df_err [9 9 9]
1700				ms [ 18.450705 73.813294 0.57026483]
1701				ms_err [ 2.5595857 1.2030692 0.55671923]
1702				ss [ 18.450705 73.813294 0.57026483]
1703				ss_err [ 23.036272 10.827623 5.0104731]
1704
1705				=cut
1706
1707				*ols_rptd = \&PDL::ols_rptd;
1708				sub PDL::ols_rptd {
1709				my ($y, $subj, @ivs_raw) = @_;
1710
1711				$y = $y->squeeze;
1712				$y->getndims > 1 and
1713				croak "ols_rptd does not support broadcasting";
1714
1715				my @ivs = map { (ref $_ eq 'PDL' and $_->ndims > 1)? $_
1716				: ref $_ eq 'PDL' ? $_->dummy(1)
1717				: scalar effect_code($_)
1718				;
1719				} @ivs_raw;
1720
1721				my %r;
1722
1723				$r{ss_total} = $y->ss;
1724
1725				# STEP 1: subj
1726				my $s = effect_code $subj;
1727				my $b_s = $y->ols_t($s);
1728				my $pred = sumover($b_s->slice('1:-1') * $s->transpose) + $b_s->slice(0);
1729				$r{ss_subject} = $r{ss_total} - $y->sse( $pred );
1730
1731				# STEP 2: add predictor variables
1732				my $iv_p = $s->glue(1, @ivs);
1733				my $b_p = $y->ols_t($iv_p);
1734
1735				# only care about coeff for predictor vars. no subj or const coeff
1736				$r{coeff} = $b_p->slice([-@ivs,-1])->sever;
1737
1738				# get total sse for this step
1739				$pred = sumover($b_p->slice('1:-1') * $iv_p->transpose) + $b_p->slice(0);
1740				my $ss_pe = $y->sse( $pred );
1741
1742				# get predictor ss by successively reducing the model
1743				$r{ss} = zeroes scalar(@ivs);
1744				for my $i (0 .. $#ivs) {
1745				my $iv = $s->glue(1, @ivs[ grep $_ != $i, 0..$#ivs ]);
1746				my $b = $y->ols_t($iv);
1747				$pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
1748				$r{ss}->slice($i) .= $y->sse($pred) - $ss_pe;
1749				}
1750
1751				# STEP 3: get predictor x subj interaction as error term
1752				my $iv_e = PDL::glue 1, map interaction_code( $s, $_ ), @ivs;
1753
1754				# get total sse for this step. full model now.
1755				my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
1756				$pred = sumover($b_f->slice('1:-1') * $iv_p->glue(1,$iv_e)->transpose) + $b_f->slice(0);
1757				$r{ss_residual} = $y->sse( $pred );
1758
1759				# get predictor x subj ss by successively reducing the error term
1760				$r{ss_err} = zeroes scalar(@ivs);
1761				for my $i (0 .. $#ivs) {
1762				my $iv = $iv_p->glue(1, map interaction_code($s, $_),
1763				@ivs[grep $_ != $i, 0..$#ivs]);
1764				my $b = $y->ols_t($iv);
1765				my $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
1766				$r{ss_err}->slice($i) .= $y->sse($pred) - $r{ss_residual};
1767				}
1768
1769				# Finally, get MS, F, etc
1770
1771				$r{df} = pdl( map $_->squeeze->ndims, @ivs );
1772				$r{ms} = $r{ss} / $r{df};
1773
1774				$r{df_err} = $s->dim(1) * $r{df};
1775				$r{ms_err} = $r{ss_err} / $r{df_err};
1776
1777				$r{F} = $r{ms} / $r{ms_err};
1778
1779				$r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} )
1780				if $CDF;
1781
1782				%r;
1783				}
1784
1785				=head2 logistic
1786
1787				=for ref
1788
1789				Logistic regression with maximum likelihood estimation using L.
1790
1791				IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
1792				the constant vector in $x. It is included in the model and returned as
1793				LAST of coeff. Returns full model in list context and coeff in scalar
1794				context.
1795
1796				The significance tests are likelihood ratio tests (-2LL deviance)
1797				tests. IV significance is tested by comparing deviances between the
1798				reduced model (ie with the IV in question removed) and the full model.
1799
1800				***NOTE: the results here are qualitatively similar to but not identical
1801				with results from R, because different algorithms are used for the
1802				nonlinear parameter fit. Use with discretion***
1803
1804				=for options
1805
1806				Default options (case insensitive):
1807
1808				INITP => zeroes( $x->dim(1) + 1 ), # n_iv + 1
1809				MAXIT => 1000,
1810				EPS => 1e-7,
1811
1812				=for usage
1813
1814				Usage:
1815
1816				# suppose this is whether a person had rented 10 movies
1817
1818				pdl> p $y = ushort( random(10)*2 )
1819				[0 0 0 1 1 0 0 1 1 1]
1820
1821				# IV 1 is box office ranking
1822
1823				pdl> p $x1 = sequence(10)
1824				[0 1 2 3 4 5 6 7 8 9]
1825
1826				# IV 2 is whether the movie is action- or chick-flick
1827
1828				pdl> p $x2 = sequence(10) % 2
1829				[0 1 0 1 0 1 0 1 0 1]
1830
1831				# concatenate the IVs together
1832
1833				pdl> p $x = cat $x1, $x2
1834				[
1835				[0 1 2 3 4 5 6 7 8 9]
1836				[0 1 0 1 0 1 0 1 0 1]
1837				]
1838
1839				pdl> %m = $y->logistic( $x )
1840
1841				pdl> p "$_\t$m{$_}\n" for sort keys %m
1842
1843				D0 13.8629436111989
1844				Dm 9.8627829791575
1845				Dm_chisq 4.00016063204141
1846				Dm_df 2
1847				Dm_p 0.135324414081692
1848				# ranking genre constant
1849				b [0.41127706 0.53876358 -2.1201285]
1850				b_chisq [ 3.5974504 0.16835559 2.8577151]
1851				b_p [0.057868258 0.6815774 0.090936587]
1852				iter 12
1853				y_pred [0.10715577 0.23683909 ... 0.76316091 0.89284423]
1854
1855				# to get the covariance out, supply a true value for the COV option:
1856				pdl> %m = $y->logistic( $x, {COV=>1} )
1857				pdl> p $m{cov};
1858
1859				=cut
1860
1861				*logistic = \&PDL::logistic;
1862				sub PDL::logistic {
1863				require PDL::Fit::LM;
1864				my ( $self, $ivs, $opt ) = @_;
1865				$self = $self->squeeze;
1866				# make compatible w multiple var cases
1867				$ivs->getndims == 1 and $ivs = $ivs->dummy(1);
1868				$self->dim(0) != $ivs->dim(0) and
1869				carp "mismatched n btwn DV and IV!";
1870				my %opt = (
1871				INITP => zeroes( $ivs->dim(1) + 1 ), # n_ivs + 1
1872				MAXIT => 1000,
1873				EPS => 1e-7,
1874				);
1875				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1876				# not using it atm
1877				$opt{WT} = 1;
1878				# Use lmfit. Fourth input argument is reference to user-defined
1879				# copy INITP so we have the original value when needed
1880				my ($yfit,$coeff,$cov,$iter)
1881				= PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy,
1882				{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1883				# apparently at least coeff is child of some pdl
1884				# which is changed in later lmfit calls
1885				$yfit = $yfit->copy;
1886				$coeff = $coeff->copy;
1887				return $coeff unless wantarray;
1888				my %ret;
1889				my $n0 = $self->where($self == 0)->nelem;
1890				my $n1 = $self->nelem - $n0;
1891				$ret{cov} = $cov if $opt{COV};
1892				$ret{D0} = -2($n0 log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
1893				$ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
1894				$ret{Dm_chisq} = $ret{D0} - $ret{Dm};
1895				$ret{Dm_df} = $ivs->dim(1);
1896				$ret{Dm_p}
1897				= 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
1898				if $CDF;
1899				my $coeff_chisq = zeroes $opt{INITP}->nelem;
1900				if ( $ivs->dim(1) > 1 ) {
1901				for my $k (0 .. $ivs->dim(1)-1) {
1902				my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
1903				my $G = $ivs->dice_axis(1, \@G);
1904				my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
1905				my $y_G
1906				= PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
1907				{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1908				$coeff_chisq->slice($k) .= $self->dm( $y_G ) - $ret{Dm};
1909				}
1910				}
1911				else {
1912				# d0 is, by definition, the deviance with only intercept
1913				$coeff_chisq->slice(0) .= $ret{D0} - $ret{Dm};
1914				}
1915				my $y_c
1916				= PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->slice('0:-2')->sever,
1917				{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1918				$coeff_chisq->slice(-1) .= $self->dm( $y_c ) - $ret{Dm};
1919				$ret{b} = $coeff;
1920				$ret{b_chisq} = $coeff_chisq;
1921				$ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 )
1922				if $CDF;
1923				$ret{y_pred} = $yfit;
1924				$ret{iter} = $iter;
1925				for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
1926				%ret;
1927				}
1928
1929				sub _logistic {
1930				my ($x,$par,$ym,$dyda) = @_;
1931				# $b and $c are fit parameters slope and intercept
1932				my $b = $par->slice([0,$x->dim(1) - 1])->sever;
1933				my $c = $par->slice(-1)->sever;
1934				# Write function with dependent variable $ym,
1935				# independent variable $x, and fit parameters as specified above.
1936				# Use the .= (dot equals) assignment operator to express the equality
1937				# (not just a plain equals)
1938				$ym .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );
1939				my @dy = map $dyda->slice(",($_)"), 0 .. $par->dim(0)-1;
1940				# Partial derivative of the function with respect to each slope
1941				# fit parameter ($b in this case). Again, note .= assignment
1942				# operator (not just "equals")
1943				$dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
1944				for (0 .. $b->dim(0)-1);
1945				# Partial derivative of the function re intercept par
1946				$dy[-1] .= $ym * (1 - $ym);
1947				}
1948
1949				sub _logistic_no_intercept {
1950				my ($x,$par,$ym,$dyda) = @_;
1951				my $b = $par->slice([0,$x->dim(1) - 1])->sever;
1952				# Write function with dependent variable $ym,
1953				# independent variable $x, and fit parameters as specified above.
1954				# Use the .= (dot equals) assignment operator to express the equality
1955				# (not just a plain equals)
1956				$ym .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );
1957				my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
1958				# Partial derivative of the function with respect to each slope
1959				# fit parameter ($b in this case). Again, note .= assignment
1960				# operator (not just "equals")
1961				$dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
1962				for 0 .. $b->dim(0)-1;
1963				}
1964
1965				=head2 pca
1966
1967				=for ref
1968
1969				Principal component analysis. Based on corr instead of cov.
1970
1971				Bad values are ignored pair-wise. OK when bad values are few but otherwise
1972				probably should fill_m etc before pca). Uses L.
1973
1974				=for options
1975
1976				Default options (case insensitive):
1977
1978				CORR => 1, # boolean. use correlation or covariance
1979				PLOT => 0, # calls plot_screes by default
1980				# can set plot_screes options here
1981				WIN => undef, # for plotting
1982
1983				=for usage
1984
1985				Usage:
1986
1987				my $d = qsort random 10, 5; # 10 obs on 5 variables
1988				my %r = $d->pca( \%opt );
1989				print "$_\t$r{$_}\n" for (keys %r);
1990
1991				eigenvalue # variance accounted for by each component
1992				[4.70192 0.199604 0.0471421 0.0372981 0.0140346]
1993
1994				eigenvector # dim var x comp. weights for mapping variables to component
1995				[
1996				[ -0.451251 -0.440696 -0.457628 -0.451491 -0.434618]
1997				[ -0.274551 0.582455 0.131494 0.255261 -0.709168]
1998				[ 0.43282 0.500662 -0.139209 -0.735144 -0.0467834]
1999				[ 0.693634 -0.428171 0.125114 0.128145 -0.550879]
2000				[ 0.229202 0.180393 -0.859217 0.4173 0.0503155]
2001				]
2002
2003				loadings # dim var x comp. correlation between variable and component
2004				[
2005				[ -0.978489 -0.955601 -0.992316 -0.97901 -0.942421]
2006				[ -0.122661 0.260224 0.0587476 0.114043 -0.316836]
2007				[ 0.0939749 0.108705 -0.0302253 -0.159616 -0.0101577]
2008				[ 0.13396 -0.0826915 0.0241629 0.0247483 -0.10639]
2009				[ 0.027153 0.0213708 -0.101789 0.0494365 0.00596076]
2010				]
2011
2012				pct_var # percent variance accounted for by each component
2013				[0.940384 0.0399209 0.00942842 0.00745963 0.00280691]
2014
2015				Plot scores along the first two components,
2016
2017				$d->plot_scores( $r{eigenvector} );
2018
2019				=cut
2020
2021				*pca = \&PDL::pca;
2022				sub PDL::pca {
2023				my ($self, $opt) = @_;
2024				my %opt = (
2025				CORR => 1,
2026				PLOT => 0,
2027				WIN => undef, # for plotting
2028				);
2029				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2030				my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;
2031				# value is axis pdl and score is var x axis
2032				my ($eigvec, $eigval) = $var_var->eigens_sym;
2033				$eigvec = $eigvec->transpose; # compatibility with PDL::Slatec::eigsys
2034				# ind is sticky point for broadcasting
2035				my $ind_sorted = $eigval->qsorti->slice('-1:0');
2036				$eigvec = $eigvec->slice(':',$ind_sorted)->sever;
2037				$eigval = $eigval->slice($ind_sorted)->sever;
2038				# var x axis
2039				my $var = $eigval / $eigval->sum->sclr;
2040				my $loadings;
2041				if ($opt{CORR}) {
2042				$loadings = $eigvec * sqrt( $eigval->transpose );
2043				}
2044				else {
2045				my $scores = $eigvec x $self->dev_m;
2046				$loadings = $self->corr( $scores->dummy(1) );
2047				}
2048				$var->plot_screes(\%opt)
2049				if $opt{PLOT};
2050				( eigenvalue=>$eigval, eigenvector=>$eigvec,
2051				pct_var=>$var, loadings=>$loadings );
2052				}
2053
2054				=head2 pca_sorti
2055
2056				Determine by which vars a component is best represented. Descending sort
2057				vars by size of association with that component. Returns sorted var and
2058				relevant component indices.
2059
2060				=for options
2061
2062				Default options (case insensitive):
2063
2064				NCOMP => 10, # maximum number of components to consider
2065
2066				=for usage
2067
2068				Usage:
2069
2070				# let's see if we replicated the Osgood et al. (1957) study
2071				pdl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}
2072
2073				# select a subset of var to do pca
2074				pdl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
2075				pdl> $data = $data( ,$ind)->sever
2076				pdl> @$idv = @$idv[list $ind]
2077
2078				pdl> %m = $data->pca
2079
2080				pdl> ($iv, $ic) = $m{loadings}->pca_sorti()
2081
2082				pdl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)
2083
2084				# COMP0 COMP1 COMP2 COMP3
2085				HAPPY [0.860191 0.364911 0.174372 -0.10484]
2086				GOOD [0.848694 0.303652 0.198378 -0.115177]
2087				CALM [0.821177 -0.130542 0.396215 -0.125368]
2088				BRIGHT [0.78303 0.232808 -0.0534081 -0.0528796]
2089				HEAVY [-0.623036 0.454826 0.50447 0.073007]
2090				HARD [-0.679179 0.0505568 0.384467 0.165608]
2091				ACTIVE [-0.161098 0.760778 -0.44893 -0.0888592]
2092				FAST [-0.196042 0.71479 -0.471355 0.00460276]
2093				LARGE [-0.241994 0.594644 0.634703 -0.00618055]
2094				BASS [-0.621213 -0.124918 0.0605367 -0.765184]
2095
2096				=cut
2097
2098				*pca_sorti = \&PDL::pca_sorti;
2099				sub PDL::pca_sorti {
2100				# $self is pdl (var x component)
2101				my ($self, $opt) = @_;
2102				my %opt = (
2103				NCOMP => 10, # maximum number of components to consider
2104				);
2105				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2106				my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
2107				$self = $self->dice_axis( 1, pdl(0..$ncomp-1) );
2108				my $icomp = $self->transpose->abs->maximum_ind;
2109				# sort between comp
2110				my $ivar_sort = $icomp->qsorti;
2111				$self = $self->slice($ivar_sort)->sever;
2112				# sort within comp
2113				my $ic = $icomp->slice($ivar_sort)->iv_cluster;
2114				for my $comp (0 .. $ic->dim(1)-1) {
2115				my $i = $self->slice(which($ic->slice(':',$comp)), "($comp)")->qsorti->slice('-1:0');
2116				$ivar_sort->slice(which $ic->slice(':',$comp))
2117				.= $ivar_sort->slice(which $ic->slice(':',$comp))->slice($i);
2118				}
2119				wantarray ? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
2120				}
2121
2122				=head2 plot_means
2123
2124				Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).
2125
2126				=for options
2127
2128				Default options (case insensitive):
2129
2130				IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
2131				DVNM => 'DV',
2132				AUTO => 1, # auto set dims to be on x-axis, line, panel
2133				# if set 0, dim 0 goes on x-axis, dim 1 as lines
2134				# dim 2+ as panels
2135				# see PDL::Graphics::Simple for next option
2136				WIN => undef, # pgswin object. not closed here if passed
2137				# allows comparing multiple lines in same plot
2138				SIZE => 640, # individual square panel size in pixels
2139
2140				=for usage
2141
2142				Usage:
2143
2144				# see anova for mean / se pdl structure
2145				$mean->plot_means( $se, {IVNM=>['apple', 'bake']} );
2146
2147				Or like this:
2148
2149				$m{'\| apple ~ bake \| m'}->plot_means;
2150
2151				=cut
2152
2153				*plot_means = \&PDL::plot_means;
2154				sub PDL::plot_means {
2155				require PDL::Graphics::Simple;
2156				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2157				my ($self, $se) = @_;
2158				$self = $self->squeeze;
2159				if ($self->ndims > 4) {
2160				carp "Data is > 4D. No plot here.";
2161				return;
2162				}
2163				my %opt = (
2164				IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
2165				DVNM => 'DV',
2166				AUTO => 1, # auto set vars to be on X axis, line, panel
2167				WIN => undef, # PDL::Graphics::Simple object
2168				SIZE => 640, # individual square panel size in pixels
2169				);
2170				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
2171				# decide which vars to plot as x axis, lines, panels
2172				# put var w most levels on x axis
2173				# put var w least levels on diff panels
2174				my @iD = 0..3;
2175				my @dims = (1, 1, 1, 1);
2176				# splice ARRAY,OFFSET,LENGTH,LIST
2177				splice @dims, 0, $self->ndims, $self->dims;
2178				$self = $self->reshape(@dims)->sever;
2179				$se = $se->reshape(@dims)->sever
2180				if defined $se;
2181				@iD = reverse list qsorti pdl @dims
2182				if $opt{AUTO};
2183				# $iD[0] on x axis
2184				# $iD[1] as separate lines
2185				my $nx = $self->dim($iD[2]); # n xpanels
2186				my $ny = $self->dim($iD[3]); # n ypanels
2187				my $w = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin(
2188				size=>[$opt{SIZE}$nx, $opt{SIZE}$ny,'px']);
2189				my $seq0 = sequence(my $dim0 = $self->dim($iD[0]));
2190				my ($pcount, @plots) = 0;
2191				for my $y (0..$ny-1) {
2192				for my $x (0..$nx-1) {
2193				my $key_prefix = "$opt{IVNM}[$iD[0]]\|";
2194				$key_prefix .= $opt{IVNM}[$iD[2]] . "=$x\|" if $nx > 1;
2195				$key_prefix .= $opt{IVNM}[$iD[3]] . "=$y\|" if $ny > 1;
2196				for (0 .. $self->dim($iD[1]) - 1) {
2197				my $ke = "$key_prefix$opt{IVNM}[$iD[1]]=$_";
2198				my $ydiced = $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_)->squeeze;
2199				push @plots, with=>'lines', ke=>"$ke mean", style=>$pcount,
2200				$seq0+$pcount*0.05, $ydiced;
2201				push @plots, with=>'errorbars', ke=>"$ke error", style=>$pcount,
2202				$seq0+$pcount*0.05, $ydiced,
2203				$se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
2204				->dice_axis($iD[1],$_)->squeeze
2205				if defined($se);
2206				$pcount++;
2207				}
2208				}
2209				}
2210				my ($ymin, $ymax) = pdl($self, !defined $se ? () : ($self+$se, $self-$se))->minmax;
2211				$w->plot(@plots,
2212				{ xlabel=>$opt{IVNM}[$iD[0]], ylabel=>$opt{DVNM},
2213				xrange=>[-0.05,$dim0-1+$pcount*0.05], yrange=>[$ymin-0.05,$ymax+0.05] }
2214				);
2215				$w;
2216				}
2217
2218				=head2 plot_residuals
2219
2220				Plots residuals against predicted values.
2221
2222				=for usage
2223
2224				Usage:
2225
2226				use PDL::Graphics::Simple;
2227				$w = pgswin();
2228				$y->plot_residuals( $y_pred, { win=>$w } );
2229
2230				=for options
2231
2232				Default options (case insensitive):
2233
2234				# see PDL::Graphics::Simple for more info
2235				WIN => undef, # pgswin object. not closed here if passed
2236				# allows comparing multiple lines in same plot
2237				SIZE => 640, # plot size in pixels
2238				COLOR => 1,
2239
2240				=cut
2241
2242				*plot_residuals = \&PDL::plot_residuals;
2243				sub PDL::plot_residuals {
2244				require PDL::Graphics::Simple;
2245				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2246				my ($y, $y_pred) = @_;
2247				my %opt = (
2248				# see PDL::Graphics::Simple for next options
2249				WIN => undef, # pgswin object. not closed here if passed
2250				# allows comparing multiple lines in same plot
2251				SIZE => 640, # plot size in pixels
2252				COLOR => 1,
2253				);
2254				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2255				my $residuals = $y - $y_pred;
2256				my $win = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2257				$win->plot(
2258				with=>'points', style=>$opt{COLOR}, $y_pred, $residuals,
2259				with=>'lines', style=>$opt{COLOR}, pdl($y_pred->minmax), pdl(0,0), # 0-line
2260				{xlabel=>'predicted value', ylabel=>'residuals'},
2261				);
2262				}
2263
2264				=head2 plot_scores
2265
2266				Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.)
2267
2268				=for options
2269
2270				Default options (case insensitive):
2271
2272				CORR => 1, # boolean. PCA was based on correlation or covariance
2273				COMP => [0,1], # indices to components to plot
2274				# see PDL::Graphics::Simple for next options
2275				WIN => undef, # pgswin object. not closed here if passed
2276				# allows comparing multiple lines in same plot
2277				SIZE => 640, # plot size in pixels
2278				COLOR => [1,2], # color for original and rotated scores
2279
2280				=for usage
2281
2282				Usage:
2283
2284				my %p = $data->pca();
2285				$data->plot_scores( $p{eigenvector}, \%opt );
2286
2287				=cut
2288
2289				*plot_scores = \&PDL::plot_scores;
2290				sub PDL::plot_scores {
2291				require PDL::Graphics::Simple;
2292				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2293				my ($self, $eigvec) = @_;
2294				my %opt = (
2295				CORR => 1, # boolean. PCA was based on correlation or covariance
2296				COMP => [0,1], # indices to components to plot
2297				# see PDL::Graphics::Simple for next options
2298				WIN => undef, # pgswin object. not closed here if passed
2299				# allows comparing multiple lines in same plot
2300				SIZE => 640, # plot size in pixels
2301				COLOR => [1,2], # color for original and transformed scoress
2302				);
2303				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
2304				my $i = pdl $opt{COMP};
2305				my $z = $opt{CORR} ? $self->stddz : $self->dev_m;
2306				# transformed normed values
2307				my $scores = sumover($eigvec->slice(':',$i) * $z->transpose->dummy(1))->transpose;
2308				$z = $z->slice(':',$i)->sever;
2309				my $win = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2310				$win->plot(
2311				with=>'points', style=>$opt{COLOR}[0], ke=>'original', $z->slice(',(0)'), $z->slice(',(1)'),
2312				with=>'points', style=>$opt{COLOR}[1], ke=>'transformed', $scores->slice(',(0)'), $scores->slice(',(1)'),
2313				{xlabel=>"Component $opt{COMP}[0]", ylabel=>"Component $opt{COMP}[1]"},
2314				);
2315				}
2316
2317				=head2 plot_screes
2318
2319				Scree plot. Plots proportion of variance accounted for by PCA components.
2320
2321				=for options
2322
2323				Default options (case insensitive):
2324
2325				NCOMP => 20, # max number of components to plot
2326				CUT => 0, # set to plot cutoff line after this many components
2327				# undef to plot suggested cutoff line for NCOMP comps
2328				# see PDL::Graphics::Simple for next options
2329				WIN => undef, # pgswin object. not closed here if passed
2330				# allows comparing multiple lines in same plot
2331				SIZE => 640, # plot size in pixels
2332
2333				=for usage
2334
2335				Usage:
2336
2337				# variance should be in descending order
2338				$d = qsort random 10, 5; # 10 obs on 5 variables
2339				%pca = $d->pca( \%opt );
2340				$pca{pct_var}->plot_screes( {ncomp=>16, win=>$win=PDL::Graphics::Simple::pgswin()} );
2341
2342				Or, because NCOMP is used so often, it is allowed a shortcut,
2343
2344				$pca{pct_var}->plot_screes( 16 );
2345
2346				=cut
2347
2348				*plot_scree = \&PDL::plot_screes; # here for now for compatibility
2349				*plot_screes = \&PDL::plot_screes;
2350				sub PDL::plot_screes {
2351				require PDL::Graphics::Simple;
2352				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2353				my ($self, $ncomp) = @_;
2354				my %opt = (
2355				NCOMP => 20, # max number of components to plot
2356				CUT => 0, # set to plot cutoff line after this many components
2357				# undef to plot suggested cutoff line for NCOMP comps
2358				# see PDL::Graphics::Simple for next options
2359				WIN => undef, # pgswin object. not closed here if passed
2360				# allows comparing multiple lines in same plot
2361				SIZE => 640, # plot size in pixels
2362				);
2363				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2364				$opt{NCOMP} = $ncomp
2365				if $ncomp;
2366				# re-use $ncomp below
2367				$ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
2368				my $self_comp = $self->slice([0,$ncomp-1]);
2369				$opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self_comp
2370				if !defined $opt{CUT};
2371				my $win = $opt{WIN} \|\|
2372				PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2373				$win->plot(
2374				with=>'lines', ke=>'scree', sequence($ncomp), $self_comp,
2375				!$opt{CUT} ? () : (with=>'lines', ke=>'cut', pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05)),
2376				{xlabel=>'Component', ylabel=>'Proportion of Variance Accounted for',
2377				xrange=>[-0.05,$ncomp-0.95], yrange=>[0,1], le=>'tr'},
2378				);
2379				}
2380
2381				=head2 plot_stripchart
2382
2383				Stripchart plot. Plots ANOVA-style data, categorised against given IVs.
2384
2385				=for options
2386
2387				Default options (case insensitive):
2388
2389				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
2390				DVNM => 'DV',
2391				# see PDL::Graphics::Simple for next options
2392				WIN => undef, # pgswin object. not closed here if passed
2393
2394				=for usage
2395
2396				Usage:
2397
2398				%m = $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)] } );
2399
2400				=cut
2401
2402				my $CHART_GAP = 0.1;
2403				*plot_stripchart = \&PDL::plot_stripchart;
2404				sub PDL::plot_stripchart {
2405				require PDL::Graphics::Simple;
2406				my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2407				my ($y, @ivs_raw) = @_;
2408				my %opt = (
2409				IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
2410				DVNM => 'DV',
2411				WIN => undef, # pgswin object. not closed here if passed
2412				);
2413				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2414				$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
2415				if !$opt{IVNM} or !@{ $opt{IVNM} };
2416				my $w = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin();
2417				my @codes = map [code_ivs($_)], @ivs_raw;
2418				my @levels = map {
2419				my $map = $_->[1];
2420				[sort {$map->{$a} <=> $map->{$b}} keys %$map];
2421				} @codes;
2422				my $xjitter = $y->random * $CHART_GAP;
2423				my ($pcount, @plots) = 0;
2424				push @plots, with=>'points', ke=>"all data", $xjitter+$pcount, $y;
2425				$pcount++;
2426				for my $i (0..$#ivs_raw) {
2427				my $levs = $levels[$i];
2428				my $name = $opt{IVNM}[$i];
2429				my $coded = $codes[$i][0];
2430				for my $j (0..$#$levs) {
2431				my $inds = which($coded == $j);
2432				push @plots, with=>'points', ke=>"$name=$levs->[$j]",
2433				$xjitter->slice($inds)+$pcount+$j*$CHART_GAP, $y->slice($inds);
2434				}
2435				$pcount++;
2436				}
2437				my ($ymin, $ymax) = $y->minmax;
2438				my $xmax = $pcount-1 + $CHART_GAP*($#{$levels[-1]} + 2);
2439				$w->plot(@plots,
2440				{ xlabel=>'IV', ylabel=>$opt{DVNM},
2441				xrange=>[-1,$xmax], yrange=>[$ymin-$CHART_GAP,$ymax+$CHART_GAP] }
2442				);
2443				$w;
2444				}
2445
2446				=head1 SEE ALSO
2447
2448				L
2449
2450				L
2451
2452				=head1 REFERENCES
2453
2454				Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied
2455				Multiple Regression/correlation Analysis for the Behavioral Sciences
2456				(3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers.
2457
2458				Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd
2459				ed.). New York, NY: Wiley-Interscience.
2460
2461				Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated
2462				measures data in cognitive research. Journal of Experimental Psychology:
2463				Learning, Memory, & Cognition, 16, 149-157.
2464
2465				Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of
2466				Meaning. Champaign, IL: University of Illinois Press.
2467
2468				Rutherford, A. (2011). ANOVA and ANCOVA: A GLM Approach (2nd ed.). Wiley.
2469
2470				Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved
2471				April 10, 2011 from http://citeseerx.ist.psu.edu/
2472
2473				The GLM procedure: unbalanced ANOVA for two-way design with
2474				interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18,
2475				2009 from http://support.sas.com/
2476
2477				Van den Noortgate, W., & Onghena, P. (2006). Analysing repeated
2478				measures data in cognitive research: A comment on regression coefficient
2479				analyses. European Journal of Cognitive Psychology, 18, 937-952.
2480
2481				=head1 AUTHOR
2482
2483				Copyright (C) 2009 Maggie J. Xiong
2484
2485				All rights reserved. There is no warranty. You are allowed to redistribute
2486				this software / documentation as described in the file COPYING in the
2487				PDL distribution.
2488
2489				=cut
2490				#line 2491 "lib/PDL/Stats/GLM.pm"
2491
2492				# Exit with OK status
2493
2494				1;