File Coverage

blib/lib/PDLA/Stats/GLM.pm

Criterion	Covered	Total	%
statement	513	680	75.4
branch	138	260	53.0
condition	31	72	43.0
subroutine	30	40	75.0
pod	0	17	0.0
total	712	1069	66.6

line	stmt	bran	cond	sub	pod	time	code
1
2							#
3							# GENERATED WITH PDLA::PP! Don't modify!
4							#
5							package PDLA::Stats::GLM;
6
7							@EXPORT_OK = qw( ols_t anova anova_rptd dummy_code effect_code effect_code_w interaction_code ols ols_rptd r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes PDLA::PP fill_m PDLA::PP fill_rand PDLA::PP dev_m PDLA::PP stddz PDLA::PP sse PDLA::PP mse PDLA::PP rmse PDLA::PP pred_logistic PDLA::PP d0 PDLA::PP dm PDLA::PP dvrs );
8							%EXPORT_TAGS = (Func=>[@EXPORT_OK]);
9
10	2			2		1427	use PDLA::Core;
	2					3
	2					12
11	2			2		459	use PDLA::Exporter;
	2					4
	2					10
12	2			2		52	use DynaLoader;
	2					3
	2					93
13
14
15
16
17							@ISA = ( 'PDLA::Exporter','DynaLoader' );
18							push @PDLA::Core::PP, __PACKAGE__;
19							bootstrap PDLA::Stats::GLM ;
20
21
22
23
24
25	2			2		10	use strict;
	2					4
	2					49
26	2			2		10	use warnings;
	2					2
	2					85
27
28	2			2		21	use Carp;
	2					5
	2					114
29	2			2		9	use PDLA::LiteF;
	2					4
	2					13
30	2			2		3009	use PDLA::MatrixOps;
	2					4
	2					13
31	2			2		251	use PDLA::NiceSlice;
	2					3
	2					13
32	2			2		118770	use PDLA::Stats::Basic;
	2					5
	2					23
33	2			2		1621	use PDLA::Stats::Kmeans;
	2					5
	2					15
34
35							$PDLA::onlinedoc->scan(__FILE__) if $PDLA::onlinedoc;
36
37							eval { require PDLA::GSL::CDF; };
38							my $CDF = 1 if !$@;
39
40							eval { require PDLA::Slatec; };
41							my $SLATEC = 1 if !$@;
42
43							eval {
44							require PDLA::Graphics::PGPLOT::Window;
45							PDLA::Graphics::PGPLOT::Window->import( 'pgwin' );
46							};
47							my $PGPLOT = 1 if !$@;
48
49							my $DEV = ($^O =~ /win/i)? '/png' : '/xs';
50
51							=head1 NAME
52
53							PDLA::Stats::GLM -- general and generalized linear modeling methods such as ANOVA, linear regression, PCA, and logistic regression.
54
55							=head1 DESCRIPTION
56
57							The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. FUNCTIONS except B support bad value. B strongly recommended for most METHODS, and it is required for B.
58
59							P-values, where appropriate, are provided if PDLA::GSL::CDF is installed.
60
61							=head1 SYNOPSIS
62
63							use PDLA::LiteF;
64							use PDLA::NiceSlice;
65							use PDLA::Stats::GLM;
66
67							# do a multiple linear regression and plot the residuals
68
69							my $y = pdl( 8, 7, 7, 0, 2, 5, 0 );
70
71							my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component
72							[ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component
73
74							my %m = $y->ols( $x, {plot=>1} );
75
76							print "$_\t$m{$_}\n" for (sort keys %m);
77
78							=cut
79
80
81
82
83
84
85
86							=head1 FUNCTIONS
87
88
89
90							=cut
91
92
93
94
95
96
97							=head2 fill_m
98
99							=for sig
100
101							Signature: (a(n); float+ [o]b(n))
102
103
104
105							=for ref
106
107							Replaces bad values with sample mean. Mean is set to 0 if all obs are bad. Can be done inplace.
108
109							=for usage
110
111							perldl> p $data
112							[
113							[ 5 BAD 2 BAD]
114							[ 7 3 7 BAD]
115							]
116
117							perldl> p $data->fill_m
118							[
119							[ 5 3.5 2 3.5]
120							[ 7 3 7 5.66667]
121							]
122
123
124
125
126							=for bad
127
128							The output pdl badflag is cleared.
129
130
131							=cut
132
133
134
135
136
137
138							*fill_m = \&PDLA::fill_m;
139
140
141
142
143
144							=head2 fill_rand
145
146							=for sig
147
148							Signature: (a(n); [o]b(n))
149
150
151
152							=for ref
153
154							Replaces bad values with random sample (with replacement) of good observations from the same variable. Can be done inplace.
155
156							=for usage
157
158							perldl> p $data
159							[
160							[ 5 BAD 2 BAD]
161							[ 7 3 7 BAD]
162							]
163
164							perldl> p $data->fill_rand
165
166							[
167							[5 2 2 5]
168							[7 3 7 7]
169							]
170
171
172
173
174							=for bad
175
176							The output pdl badflag is cleared.
177
178
179							=cut
180
181
182
183
184
185
186							*fill_rand = \&PDLA::fill_rand;
187
188
189
190
191
192							=head2 dev_m
193
194							=for sig
195
196							Signature: (a(n); float+ [o]b(n))
197
198
199
200							=for ref
201
202							Replaces values with deviations from the mean. Can be done inplace.
203
204
205
206
207							=for bad
208
209							dev_m processes bad values.
210							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
211
212
213							=cut
214
215
216
217
218
219
220							*dev_m = \&PDLA::dev_m;
221
222
223
224
225
226							=head2 stddz
227
228							=for sig
229
230							Signature: (a(n); float+ [o]b(n))
231
232
233							=for ref
234
235							Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0). Can be done inplace.
236
237
238
239
240							=for bad
241
242							stddz processes bad values.
243							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
244
245
246							=cut
247
248
249
250
251
252
253							*stddz = \&PDLA::stddz;
254
255
256
257
258
259							=head2 sse
260
261							=for sig
262
263							Signature: (a(n); b(n); float+ [o]c())
264
265
266
267							=for ref
268
269							Sum of squared errors between actual and predicted values.
270
271
272
273
274							=for bad
275
276							sse processes bad values.
277							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
278
279
280							=cut
281
282
283
284
285
286
287							*sse = \&PDLA::sse;
288
289
290
291
292
293							=head2 mse
294
295							=for sig
296
297							Signature: (a(n); b(n); float+ [o]c())
298
299
300
301							=for ref
302
303							Mean of squared errors between actual and predicted values, ie variance around predicted value.
304
305
306
307
308							=for bad
309
310							mse processes bad values.
311							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
312
313
314							=cut
315
316
317
318
319
320
321							*mse = \&PDLA::mse;
322
323
324
325
326
327							=head2 rmse
328
329							=for sig
330
331							Signature: (a(n); b(n); float+ [o]c())
332
333
334
335							=for ref
336
337							Root mean squared error, ie stdv around predicted value.
338
339
340
341
342							=for bad
343
344							rmse processes bad values.
345							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
346
347
348							=cut
349
350
351
352
353
354
355							*rmse = \&PDLA::rmse;
356
357
358
359
360
361							=head2 pred_logistic
362
363							=for sig
364
365							Signature: (a(n,m); b(m); float+ [o]c(n))
366
367
368
369							=for ref
370
371							Calculates predicted prob value for logistic regression.
372
373							=for usage
374
375							# glue constant then apply coeff returned by the logistic method
376
377							$pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} );
378
379
380
381
382							=for bad
383
384							pred_logistic processes bad values.
385							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
386
387
388							=cut
389
390
391
392
393
394
395							*pred_logistic = \&PDLA::pred_logistic;
396
397
398
399
400
401							=head2 d0
402
403							=for sig
404
405							Signature: (a(n); float+ [o]c())
406
407
408							=for usage
409
410							my $d0 = $y->d0();
411
412							=for ref
413
414							Null deviance for logistic regression.
415
416
417
418
419							=for bad
420
421							d0 processes bad values.
422							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
423
424
425							=cut
426
427
428
429
430
431
432							*d0 = \&PDLA::d0;
433
434
435
436
437
438							=head2 dm
439
440							=for sig
441
442							Signature: (a(n); b(n); float+ [o]c())
443
444
445							=for usage
446
447							my $dm = $y->dm( $y_pred );
448
449							# null deviance
450							my $d0 = $y->dm( ones($y->nelem) * $y->avg );
451
452							=for ref
453
454							Model deviance for logistic regression.
455
456
457
458
459							=for bad
460
461							dm processes bad values.
462							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
463
464
465							=cut
466
467
468
469
470
471
472							*dm = \&PDLA::dm;
473
474
475
476
477
478							=head2 dvrs
479
480							=for sig
481
482							Signature: (a(); b(); float+ [o]c())
483
484
485
486							=for ref
487
488							Deviance residual for logistic regression.
489
490
491
492
493							=for bad
494
495							dvrs processes bad values.
496							It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles.
497
498
499							=cut
500
501
502
503
504
505
506							*dvrs = \&PDLA::dvrs;
507
508
509
510
511							# my tmp var for PDLA 2.007 slice upate
512							my $_tmp;
513
514							=head2 ols_t
515
516							=for ref
517
518							Threaded version of ordinary least squares regression (B). The price of threading was losing significance tests for coefficients (but see B). The fitting function was shamelessly copied then modified from PDLA::Fit::Linfit. Uses PDLA::Slatec when possible but otherwise uses PDLA::MatrixOps. Intercept is LAST of coeff if CONST => 1.
519
520							ols_t does not handle bad values. consider B or B if there are bad values.
521
522							=for options
523
524							Default options (case insensitive):
525
526							CONST => 1,
527
528							=for usage
529
530							Usage:
531
532							# DV, 2 person's ratings for top-10 box office movies
533							# ascending sorted by box office numbers
534
535							perldl> p $y = qsort ceil( random(10, 2)*5 )
536							[
537							[1 1 2 4 4 4 4 5 5 5]
538							[1 2 2 2 3 3 3 3 5 5]
539							]
540
541							# model with 2 IVs, a linear and a quadratic trend component
542
543							perldl> $x = cat sequence(10), sequence(10)**2
544
545							# suppose our novice modeler thinks this creates 3 different models
546							# for predicting movie ratings
547
548							perldl> p $x = cat $x, $x * 2, $x * 3
549							[
550							[
551							[ 0 1 2 3 4 5 6 7 8 9]
552							[ 0 1 4 9 16 25 36 49 64 81]
553							]
554							[
555							[ 0 2 4 6 8 10 12 14 16 18]
556							[ 0 2 8 18 32 50 72 98 128 162]
557							]
558							[
559							[ 0 3 6 9 12 15 18 21 24 27]
560							[ 0 3 12 27 48 75 108 147 192 243]
561							]
562							]
563
564							perldl> p $x->info
565							PDLA: Double D [10,2,3]
566
567							# insert a dummy dim between IV and the dim (model) to be threaded
568
569							perldl> %m = $y->ols_t( $x->dummy(2) )
570
571							perldl> p "$_\t$m{$_}\n" for (sort keys %m)
572
573							# 2 persons' ratings, eached fitted with 3 "different" models
574
575							F
576							[
577							[ 38.314159 25.087209]
578							[ 38.314159 25.087209]
579							[ 38.314159 25.087209]
580							]
581
582							# df is the same across dv and iv models
583
584							F_df [2 7]
585							F_p
586							[
587							[0.00016967051 0.00064215074]
588							[0.00016967051 0.00064215074]
589							[0.00016967051 0.00064215074]
590							]
591
592							R2
593							[
594							[ 0.9162963 0.87756762]
595							[ 0.9162963 0.87756762]
596							[ 0.9162963 0.87756762]
597							]
598
599							b
600							[ # linear quadratic constant
601							[
602							[ 0.99015152 -0.056818182 0.66363636] # person 1
603							[ 0.18939394 0.022727273 1.4] # person 2
604							]
605							[
606							[ 0.49507576 -0.028409091 0.66363636]
607							[ 0.09469697 0.011363636 1.4]
608							]
609							[
610							[ 0.33005051 -0.018939394 0.66363636]
611							[ 0.063131313 0.0075757576 1.4]
612							]
613							]
614
615							# our novice modeler realizes at this point that
616							# the 3 models only differ in the scaling of the IV coefficients
617
618							ss_model
619							[
620							[ 20.616667 13.075758]
621							[ 20.616667 13.075758]
622							[ 20.616667 13.075758]
623							]
624
625							ss_residual
626							[
627							[ 1.8833333 1.8242424]
628							[ 1.8833333 1.8242424]
629							[ 1.8833333 1.8242424]
630							]
631
632							ss_total [22.5 14.9]
633							y_pred
634							[
635							[
636							[0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273]
637							...
638
639							=cut
640
641							*ols_t = \&PDLA::ols_t;
642							sub PDLA::ols_t {
643							# y [n], ivs [n x attr] pdl
644	105			105	0	9936	my ($y, $ivs, $opt) = @_;
645	105					332	my %opt = ( CONST => 1 );
646	105		33			840	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
647
648							# $y = $y->squeeze;
649	105	100				535	$ivs = $ivs->dummy(1) if $ivs->getndims == 1;
650							# set up ivs and const as ivs
651							$opt{CONST} and
652	105	100				472	$ivs = $ivs->glue( 1, ones($ivs->dim(0)) );
653
654							# Internally normalise data
655							# (double) it or ushort y and sequence iv won't work right
656	105					3261	my $ymean = $y->abs->sumover->double / $y->dim(0);
657	105					9145	($_tmp = $ymean->where( $ymean==0 )) .= 1;
658	105					8098	my $y2 = $y / $ymean->dummy(0);
659
660							# Do the fit
661
662	105					3741	my $Y = $ivs x $y2->dummy(0);
663
664	105					6648	my $C;
665
666	105	50				291	if ( $SLATEC ) {
667	0					0	$C = PDLA::Slatec::matinv( $ivs x $ivs->xchg(0,1) );
668							}
669							else {
670	105					569	$C = inv( $ivs x $ivs->xchg(0,1) );
671							}
672
673							# Fitted coefficients vector
674	105					4626069	my $coeff = PDLA::squeeze( $C x $Y );
675
676	105	100	100			11671	$coeff = $coeff->dummy(0)
677							if $coeff->getndims == 1 and $y->getndims > 1;
678	105					481	$coeff *= $ymean->dummy(0); # Un-normalise
679
680	105	100				4759	return $coeff
681							unless wantarray;
682
683	4					6	my %ret;
684
685							# *$coeff x $ivs looks nice but produces nan on successive tries*
686	4					14	$ret{y_pred} = sumover( $coeff->dummy(1) * $ivs->xchg(0,1) );
687	4	100				240	$ret{ss_total} = $opt{CONST}? $y->ss : sumover( $y ** 2 );
688	4					47	$ret{ss_residual} = $y->sse( $ret{y_pred} );
689	4					33	$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
690	4					25	$ret{R2} = $ret{ss_model} / $ret{ss_total};
691
692	4	100				17	my $n_var = $opt{CONST}? $ivs->dim(1) - 1 : $ivs->dim(1);
693	4					18	$ret{F_df} = pdl( $n_var, $y->dim(0) - $ivs->dim(1) );
694							$ret{F}
695	4					84	= $ret{ss_model} / $ret{F_df}->(0) / ($ret{ss_residual} / $ret{F_df}->(1));
696							$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog )
697	4	50				141	if $CDF;
698
699	4	50				13	for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze };
	28					829
700
701	4					140	$ret{b} = $coeff;
702
703	4					48	return %ret;
704							}
705
706							=head2 r2_change
707
708							=for ref
709
710							Significance test for the incremental change in R2 when new variable(s) are added to an ols regression model. Returns the change stats as well as stats for both models. Based on B. (One way to make up for the lack of significance tests for coeffs in ols_t).
711
712							=for options
713
714							Default options (case insensitive):
715
716							CONST => 1,
717
718							=for usage
719
720							Usage:
721
722							# suppose these are two persons' ratings for top 10 box office movies
723							# ascending sorted by box office
724
725							perldl> p $y = qsort ceil(random(10, 2) * 5)
726							[
727							[1 1 2 2 2 3 4 4 4 4]
728							[1 2 2 3 3 3 4 4 5 5]
729							]
730
731							# first IV is a simple linear trend
732
733							perldl> p $x1 = sequence 10
734							[0 1 2 3 4 5 6 7 8 9]
735
736							# the modeler wonders if adding a quadratic trend improves the fit
737
738							perldl> p $x2 = sequence(10) ** 2
739							[0 1 4 9 16 25 36 49 64 81]
740
741							# two difference models are given in two pdls
742							# each as would be pass on to ols_t
743							# the 1st model includes only linear trend
744							# the 2nd model includes linear and quadratic trends
745							# when necessary use dummy dim so both models have the same ndims
746
747							perldl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )
748
749							perldl> p "$_\t$c{$_}\n" for (sort keys %c)
750							# person 1 person 2
751							F_change [0.72164948 0.071283096]
752							# df same for both persons
753							F_df [1 7]
754							F_p [0.42370145 0.79717232]
755							R2_change [0.0085966043 0.00048562549]
756							model0 HASH(0x8c10828)
757							model1 HASH(0x8c135c8)
758
759							# the answer here is no.
760
761							=cut
762
763							*r2_change = \&PDLA::r2_change;
764							sub PDLA::r2_change {
765	1			1	0	2104	my ($self, $ivs0, $ivs1, $opt) = @_;
766	1	50				6	$ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
767
768	1					26	my %ret;
769
770	1					3	$ret{model0} = { $self->ols_t( $ivs0, $opt ) };
771	1					4	$ret{model1} = { $self->ols_t( $ivs1, $opt ) };
772
773	1					10	$ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
774							$ret{F_df}
775							= pdl($ivs1->dim(1) - $ivs0->dim(1),
776	1					7	$ret{model1}->{F_df}->((1)) );
777							$ret{F_change}
778							= $ret{R2_change} * $ret{F_df}->((1))
779	1					34	/ ( (1-$ret{model1}->{R2}) * $ret{F_df}->((0)) );
780							$ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $ret{F_df}->dog )
781	1	50				47	if $CDF;
782
783	1	100				3	for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze };
	5					73
784
785	1					33	return %ret;
786							}
787
788							=head1 METHODS
789
790							=head2 anova
791
792							=for ref
793
794							Analysis of variance. Uses type III sum of squares for unbalanced data.
795
796							Dependent variable should be a 1D pdl. Independent variables can be passed as 1D perl array ref or 1D pdl.
797
798							Supports bad value (by ignoring missing or BAD values in dependent and independent variables list-wise).
799
800							=for options
801
802							Default options (case insensitive):
803
804							V => 1, # carps if bad value in variables
805							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
806							PLOT => 1, # plots highest order effect
807							# can set plot_means options here
808
809							=for usage
810
811							Usage:
812
813							# suppose this is ratings for 12 apples
814
815							perldl> p $y = qsort ceil( random(12)*5 )
816							[1 1 2 2 2 3 3 4 4 4 5 5]
817
818							# IV for types of apple
819
820							perldl> p $a = sequence(12) % 3 + 1
821							[1 2 3 1 2 3 1 2 3 1 2 3]
822
823							# IV for whether we baked the apple
824
825							perldl> @b = qw( y y y y y y n n n n n n )
826
827							perldl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } )
828
829							perldl> p "$_\t$m{$_}\n" for (sort keys %m)
830							# apple # m
831							[
832							[2.5 3 3.5]
833							]
834
835							# apple # se
836							[
837							[0.64549722 0.91287093 0.64549722]
838							]
839
840							# apple ~ bake # m
841							[
842							[1.5 1.5 2.5]
843							[3.5 4.5 4.5]
844							]
845
846							# apple ~ bake # se
847							[
848							[0.5 0.5 0.5]
849							[0.5 0.5 0.5]
850							]
851
852							# bake # m
853							[
854							[ 1.8333333 4.1666667]
855							]
856
857							# bake # se
858							[
859							[0.30731815 0.30731815]
860							]
861
862							F 7.6
863							F_df [5 6]
864							F_p 0.0141586545851857
865							ms_model 3.8
866							ms_residual 0.5
867							ss_model 19
868							ss_residual 3
869							ss_total 22
870							\| apple \| F 2
871							\| apple \| F_df [2 6]
872							\| apple \| F_p 0.216
873							\| apple \| ms 1
874							\| apple \| ss 2
875							\| apple ~ bake \| F 0.666666666666667
876							\| apple ~ bake \| F_df [2 6]
877							\| apple ~ bake \| F_p 0.54770848985725
878							\| apple ~ bake \| ms 0.333333333333334
879							\| apple ~ bake \| ss 0.666666666666667
880							\| bake \| F 32.6666666666667
881							\| bake \| F_df [1 6]
882							\| bake \| F_p 0.00124263849516693
883							\| bake \| ms 16.3333333333333
884							\| bake \| ss 16.3333333333333
885
886							=cut
887
888							*anova = \&PDLA::anova;
889							sub PDLA::anova {
890	4	50		4	0	4654	my $opt = pop @_
891							if ref $_[-1] eq 'HASH';
892	4					11	my ($y, @ivs_raw) = @_;
893							croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?"
894	4	50	66			14	if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem);
	3					15
895
896	4					9	for (@ivs_raw) {
897	10	50	66			144	croak "too many dims in IV!"
898							if ref $_ eq 'PDLA' and $_->squeeze->ndims > 1;
899							}
900
901	4					145	my %opt = (
902							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
903							PLOT => 1, # plots highest order effect
904							V => 1, # carps if bad value
905							);
906	4		33			34	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
907	1					4	$opt{IVNM} = [ map { "IV_$_" } (0 .. $#ivs_raw) ]
908	4	100	66			14	if !$opt{IVNM} or !@{$opt{IVNM}};
	4					16
909	4					6	my @idv = @{ $opt{IVNM} };
	4					9
910
911	4					7	my %ret;
912
913	4					10	$y = $y->squeeze;
914							# create new vars here so we don't mess up original caller @
915							my @pdl_ivs_raw = map {
916	4	100				121	my $var = (ref $_ eq 'PDLA')? [list $_] : $_;
	10					34
917	10					183	scalar PDLA::Stats::Basic::_array_to_pdl $var;
918							} @ivs_raw;
919
920	4					12	my $pdl_ivs_raw = pdl \@pdl_ivs_raw;
921							# explicit set badflag if any iv had bad value because pdl() removes badflag
922	4					92	$pdl_ivs_raw->badflag( scalar grep { $_->badflag } @pdl_ivs_raw );
	10					26
923
924	4					13	($y, $pdl_ivs_raw) = _rm_bad_value( $y, $pdl_ivs_raw );
925
926	4	100	100			34	if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) {
927	1					41	printf STDERR "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem;
928							}
929
930							# dog preserves data flow
931	4					19	@pdl_ivs_raw = map {$_->copy} $pdl_ivs_raw->dog;
	10					340
932
933	4					162	my ($ivs_ref, $i_cmo_ref)
934							= _effect_code_ivs( \@pdl_ivs_raw );
935
936	4					13	($ivs_ref, $i_cmo_ref, my( $idv, $ivs_cm_ref ))
937							= _add_interactions( $ivs_ref, $i_cmo_ref, \@idv, \@pdl_ivs_raw );
938
939							# add const here
940	4					17	my $ivs = PDLA->null->glue( 1, @$ivs_ref );
941	4					722	$ivs = $ivs->glue(1, ones $ivs->dim(0));
942
943	4					525	my $b_full = $y->ols_t( $ivs, {CONST=>0} );
944
945	4					75	$ret{ss_total} = $y->ss;
946	4					152	$ret{ss_residual} = $y->sse( sumover( $b_full * $ivs->xchg(0,1) ) );
947	4					49	$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
948	4					31	$ret{F_df} = pdl($ivs->dim(1) - 1, $y->nelem - ($ivs->dim(1) - 1) -1);
949	4					94	$ret{ms_model} = $ret{ss_model} / $ret{F_df}->(0);
950	4					85	$ret{ms_residual} = $ret{ss_residual} / $ret{F_df}->(1);
951	4					79	$ret{F} = $ret{ms_model} / $ret{ms_residual};
952							$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog )
953	4	50				16	if $CDF;
954
955							# get IV ss from $ivs_ref instead of $ivs pdl
956
957	4					11	for my $k (0 .. $#$ivs_ref) {
958	22					45	my (@G, $G, $b_G);
959	22					58	@G = grep { $_ != $k } (0 .. $#$ivs_ref);
	148					220
960
961	22	100				66	if (@G) {
962	21					57	$G = PDLA->null->glue( 1, @$ivs_ref[@G] );
963	21					3020	$G = $G->glue(1, ones $G->dim(0));
964							}
965							else {
966	1					5	$G = ones( $y->dim(0) );
967							}
968	22					2817	$b_G = $y->ols_t( $G, {CONST=>0} );
969
970							$ret{ "\| $idv->[$k] \| ss" }
971	22					170	= $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual};
972							$ret{ "\| $idv->[$k] \| F_df" }
973	22					1295	= pdl( $ivs_ref->[$k]->dim(1), $ret{F_df}->(1)->copy )->squeeze;
974							$ret{ "\| $idv->[$k] \| ms" }
975	22					2138	= $ret{ "\| $idv->[$k] \| ss" } / $ret{ "\| $idv->[$k] \| F_df" }->(0);
976							$ret{ "\| $idv->[$k] \| F" }
977	22					570	= $ret{ "\| $idv->[$k] \| ms" } / $ret{ms_residual};
978							$ret{ "\| $idv->[$k] \| F_p" }
979	22	50				206	= 1 - $ret{ "\| $idv->[$k] \| F" }->gsl_cdf_fdist_P( $ret{ "\| $idv->[$k] \| F_df" }->dog )
980							if $CDF;
981							}
982
983	4					29	for (keys %ret) { $ret{$_} = $ret{$_}->squeeze };
	116					3082
984
985	4					130	my $cm_ref = _cell_means( $y, $ivs_cm_ref, $i_cmo_ref, $idv, \@pdl_ivs_raw );
986							# sort bc we can't count on perl % internal key order implementation
987	4					54	@ret{ sort keys %$cm_ref } = @$cm_ref{ sort keys %$cm_ref };
988
989	4					10	my $highest = join(' ~ ', @{ $opt{IVNM} });
	4					19
990							$cm_ref->{"# $highest # m"}->plot_means( $cm_ref->{"# $highest # se"}, {%opt, IVNM=>$idv} )
991	4	50				13	if $opt{PLOT};
992
993	4					228	return %ret;
994							}
995
996							sub _old_interface_check {
997	0			0		0	my ($n, $ivs_ref) = @_;
998							return 1
999	0	0	0			0	if (ref $ivs_ref->[0][0] eq 'ARRAY') and (@{ $ivs_ref->[0][0] } != $n);
	0					0
1000							}
1001
1002							sub _effect_code_ivs {
1003	12			12		30	my $ivs = shift;
1004
1005	12					24	my (@i_iv, @i_cmo);
1006	12					32	for (@$ivs) {
1007	28					379	my ($e, $map) = effect_code($_->squeeze);
1008	28	50				128	my $var = ($e->getndims == 1)? $e->dummy(1) : $e;
1009	28					52	push @i_iv, $var;
1010	28					133	my @indices = sort { $a<=>$b } values %$map;
	59					112
1011	28					64	push @i_cmo, pdl @indices;
1012							}
1013	12					289	return \@i_iv, \@i_cmo;
1014							}
1015
1016							sub _add_interactions {
1017	12			12		28	my ($var_ref, $i_cmo_ref, $idv, $raw_ref) = @_;
1018
1019							# append info re inter to main effects
1020	12					18	my (@inter, @idv_inter, @inter_cm, @inter_cmo);
1021	12					44	for my $nway ( 2 .. @$var_ref ) {
1022	16					64	my $iter_idv = _combinations( $nway, [0..$#$var_ref] );
1023
1024	16					38	while ( my @v = &$iter_idv() ) {
1025	32					155	my $i = ones( $var_ref->[0]->dim(0), 1 );
1026	32					1674	for (@v) {
1027	74					2023	$i = $i * $var_ref->[$_]->dummy(1);
1028	74					3358	$i = $i->clump(1,2);
1029							}
1030	32					1265	push @inter, $i;
1031
1032	32					82	my $e = join( ' ~ ', @$idv[@v] );
1033	32					50	push @idv_inter, $e;
1034
1035							# now prepare for cell mean
1036	32					46	my @i_cm = ();
1037	32					129	for my $o ( 0 .. $raw_ref->[0]->dim(0) - 1 ) {
1038	1540					81160	my @cell = map { $_($o)->squeeze } @$raw_ref[@v];
	3594					82820
1039	1540					57942	push @i_cm, join('', @cell);
1040							}
1041	32					1776	my ($inter, $map) = effect_code( \@i_cm );
1042	32					66	push @inter_cm, $inter;
1043
1044							# get the order to put means in correct multi dim pdl pos
1045							# this is order in var_e dim(1)
1046	32					180	my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map;
	658					805
1047							# this is order needed for cell mean
1048	32					91	my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) }
	699					962
1049							0 .. $#levels;
1050	32					93	push @inter_cmo, pdl @i_cmo;
1051							}
1052							}
1053							# append info re inter to main effects
1054	12					88	return ([@$var_ref, @inter], [@$i_cmo_ref, @inter_cmo],
1055							[@$idv, @idv_inter], [@$var_ref, @inter_cm] );
1056							}
1057
1058							sub _cell_means {
1059	12			12		36	my ($data, $ivs_ref, $i_cmo_ref, $ids, $raw_ref) = @_;
1060
1061	12					17	my %ind_id;
1062	12					87	@ind_id{ @$ids } = 0..$#$ids;
1063
1064	12					23	my %cm;
1065	12					23	my $i = 0;
1066	12					35	for (@$ivs_ref) {
1067	60					206	my $last = zeroes $_->dim(0);
1068	60					2847	my $i_neg = which $_( ,0) == -1;
1069	60					3517	($_tmp = $last($i_neg)) .= 1;
1070	60					3605	($_tmp = $_->where($_ == -1)) .= 0;
1071	60					4757	$_ = $_->glue(1, $last);
1072
1073	60					5106	my @v = split ' ~ ', $ids->[$i];
1074	60					162	my @shape = map { $raw_ref->[$_]->uniq->nelem } @ind_id{@v};
	102					7661
1075
1076	60					11272	my ($m, $ss) = $data->centroid( $_ );
1077	60					238	$m = $m($i_cmo_ref->[$i])->sever;
1078	60					2547	$ss = $ss($i_cmo_ref->[$i])->sever;
1079	60					2117	$m = $m->reshape(@shape);
1080	60	100				2130	$m->getndims == 1 and $m = $m->dummy(1);
1081	60					2217	my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape);
1082	60	100				2731	$se->getndims == 1 and $se = $se->dummy(1);
1083	60					908	$cm{ "# $ids->[$i] # m" } = $m;
1084	60					142	$cm{ "# $ids->[$i] # se" } = $se;
1085	60					280	$i++;
1086							}
1087	12					56	return \%cm;
1088							}
1089
1090							# http://www.perlmonks.org/?node_id=371228
1091							sub _combinations {
1092	16			16		35	my ($num, $arr) = @_;
1093
1094	0			0		0	return sub { return }
1095	16	50	33			77	if $num == 0 or $num > @$arr;
1096
1097	16					20	my @pick;
1098
1099							return sub {
1100	48	100		48		924	return @$arr[ @pick = ( 0 .. $num - 1 ) ]
1101							unless @pick;
1102
1103	32					55	my $i = $#pick;
1104	32		100			213	$i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i;
1105	32	100				177	return if $i < 0;
1106
1107	16					42	@pick[$i .. $#pick] = $pick[$i] .. $#$arr;
1108
1109	16					43	return @$arr[@pick];
1110	16					97	};
1111							}
1112
1113							=head2 anova_rptd
1114
1115							Repeated measures and mixed model anova. Uses type III sum of squares. The standard error (se) for the means are based on the relevant mean squared error from the anova, ie it is pooled across levels of the effect.
1116
1117							anova_rptd supports bad value in the dependent and independent variables. It automatically removes bad data listwise, ie remove a subject's data if there is any cell missing for the subject.
1118
1119							Default options (case insensitive):
1120
1121							V => 1, # carps if bad value in dv
1122							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1123							BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1124							PLOT => 1, # plots highest order effect
1125							# see plot_means() for more options
1126
1127							Usage:
1128
1129							Some fictional data: recall_w_beer_and_wings.txt
1130
1131							Subject Beer Wings Recall
1132							Alex 1 1 8
1133							Alex 1 2 9
1134							Alex 1 3 12
1135							Alex 2 1 7
1136							Alex 2 2 9
1137							Alex 2 3 12
1138							Brian 1 1 12
1139							Brian 1 2 13
1140							Brian 1 3 14
1141							Brian 2 1 9
1142							Brian 2 2 8
1143							Brian 2 3 14
1144							...
1145
1146							# rtable allows text only in 1st row and col
1147							my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt';
1148
1149							my ($b, $w, $dv) = $data->dog;
1150							# subj and IVs can be 1d pdl or @ ref
1151							# subj must be the first argument
1152							my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} );
1153
1154							print "$_\t$m{$_}\n" for (sort keys %m);
1155
1156							# Beer # m
1157							[
1158							[ 10.916667 8.9166667]
1159							]
1160
1161							# Beer # se
1162							[
1163							[ 0.4614791 0.4614791]
1164							]
1165
1166							# Beer ~ Wings # m
1167							[
1168							[ 10 7]
1169							[ 10.5 9.25]
1170							[12.25 10.5]
1171							]
1172
1173							# Beer ~ Wings # se
1174							[
1175							[0.89170561 0.89170561]
1176							[0.89170561 0.89170561]
1177							[0.89170561 0.89170561]
1178							]
1179
1180							# Wings # m
1181							[
1182							[ 8.5 9.875 11.375]
1183							]
1184
1185							# Wings # se
1186							[
1187							[0.67571978 0.67571978 0.67571978]
1188							]
1189
1190							ss_residual 19.0833333333333
1191							ss_subject 24.8333333333333
1192							ss_total 133.833333333333
1193							\| Beer \| F 9.39130434782609
1194							\| Beer \| F_p 0.0547977008378944
1195							\| Beer \| df 1
1196							\| Beer \| ms 24
1197							\| Beer \| ss 24
1198							\| Beer \|\| err df 3
1199							\| Beer \|\| err ms 2.55555555555556
1200							\| Beer \|\| err ss 7.66666666666667
1201							\| Beer ~ Wings \| F 0.510917030567687
1202							\| Beer ~ Wings \| F_p 0.623881438624431
1203							\| Beer ~ Wings \| df 2
1204							\| Beer ~ Wings \| ms 1.625
1205							\| Beer ~ Wings \| ss 3.25000000000001
1206							\| Beer ~ Wings \|\| err df 6
1207							\| Beer ~ Wings \|\| err ms 3.18055555555555
1208							\| Beer ~ Wings \|\| err ss 19.0833333333333
1209							\| Wings \| F 4.52851711026616
1210							\| Wings \| F_p 0.0632754786153548
1211							\| Wings \| df 2
1212							\| Wings \| ms 16.5416666666667
1213							\| Wings \| ss 33.0833333333333
1214							\| Wings \|\| err df 6
1215							\| Wings \|\| err ms 3.65277777777778
1216							\| Wings \|\| err ss 21.9166666666667
1217
1218							For mixed model anova, ie when there are between-subject IVs involved, feed the IVs as above, but specify in BTWN which IVs are between-subject. For example, if we had added age as a between-subject IV in the above example, we would do
1219
1220							my %m = $dv->anova_rptd( $subj, $age, $b, $w,
1221							{ ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] });
1222
1223							=cut
1224
1225							*anova_rptd = \&PDLA::anova_rptd;
1226							sub PDLA::anova_rptd {
1227	8	50		8	0	12615	my $opt = pop @_
1228							if ref $_[-1] eq 'HASH';
1229	8					29	my ($y, $subj, @ivs_raw) = @_;
1230
1231	8					23	for (@ivs_raw) {
1232	18	50	33			443	croak "too many dims in IV!"
1233							if ref $_ eq 'PDLA' and $_->squeeze->ndims > 1
1234							}
1235
1236	8					291	my %opt = (
1237							V => 1, # carps if bad value in dv
1238							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1239							BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1240							PLOT => 1, # plots highest order effect
1241							);
1242	8		33			76	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
1243	1					5	$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
1244	8	100	66			29	if !$opt{IVNM} or !@{ $opt{IVNM} };
	8					35
1245	8					14	my @idv = @{ $opt{IVNM} };
	8					22
1246
1247	8					13	my %ret;
1248
1249							# create new vars here so we don't mess up original caller @
1250							my ($sj, @pdl_ivs_raw)
1251	8	100				21	= map { my $var = (ref $_ eq 'PDLA')? [list $_] : $_;
	26					90
1252	26					580	scalar PDLA::Stats::Basic::_array_to_pdl $var;
1253							} ( $subj, @ivs_raw );
1254
1255							# delete bad data listwise ie remove subj if any cell missing
1256	8					35	$y = $y->squeeze;
1257	8					289	my $pdl_ivs_raw = pdl \@pdl_ivs_raw;
1258							# explicit set badflag because pdl() removes badflag
1259	8					163	$pdl_ivs_raw->badflag( scalar grep { $_->badflag } @pdl_ivs_raw );
	18					49
1260
1261	8					103	my $ibad = which( $y->isbad \| nbadover($pdl_ivs_raw->transpose) );
1262
1263	8					455	my $sj_bad = $sj($ibad)->uniq;
1264	8	100				844	if ($sj_bad->nelem) {
1265							print STDERR $sj_bad->nelem . " subjects with missing data removed\n"
1266	3	50				11	if $opt{V};
1267							$sj = $sj->setvaltobad($_)
1268	3					10	for (list $sj_bad);
1269	3					101	my $igood = which $sj->isgood;
1270	3					84	for ($y, $sj, @pdl_ivs_raw) {
1271	12					29	$_ = $_( $igood )->sever;
1272	12					444	$_->badflag(0);
1273							}
1274							}
1275							# code for ivs and cell mean in diff @s: effect_code vs iv_cluster
1276	8					29	my ($ivs_ref, $i_cmo_ref)
1277							= _effect_code_ivs( \@pdl_ivs_raw );
1278
1279	8					33	($ivs_ref, $i_cmo_ref, my( $idv, $ivs_cm_ref))
1280							= _add_interactions( $ivs_ref, $i_cmo_ref, \@idv, \@pdl_ivs_raw );
1281
1282							# matches $ivs_ref, with an extra last pdl for subj effect
1283	8					46	my $err_ref
1284							= _add_errors( $sj, $ivs_ref, $idv, \@pdl_ivs_raw, \%opt );
1285
1286							# stitch together
1287	8					31	my $ivs = PDLA->null->glue( 1, @$ivs_ref );
1288	8	100				1113	$ivs = $ivs->glue(1, grep { defined($_) and ref($_) } @$err_ref);
	46					131
1289	8					834	$ivs = $ivs->glue(1, ones $ivs->dim(0));
1290	8					963	my $b_full = $y->ols_t( $ivs, {CONST=>0} );
1291
1292	8					153	$ret{ss_total} = $y->ss;
1293	8					260	$ret{ss_residual} = $y->sse( sumover( $b_full * $ivs->xchg(0,1) ) );
1294
1295	8					71	my @full = (@$ivs_ref, @$err_ref);
1296	8					28	EFFECT: for my $k (0 .. $#full) {
1297	84	100				304	my $e = ($k > $#$ivs_ref)? '\| err' : '';
1298	84	100				225	my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k;
1299
1300	84	100				315	if (!defined $full[$k]) { # ss_residual as error
		100
1301	14					55	$ret{ "\| $idv->[$i] \|$e ss" } = $ret{ss_residual};
1302							# highest ord inter for purely within design, (p-1)(q-1)(n-1)
1303							$ret{ "\| $idv->[$i] \|$e df" }
1304	14					42	= pdl(map { $_->dim(1) } @full[0 .. $#ivs_raw])->prodover;
	36					96
1305	14	100				475	$ret{ "\| $idv->[$i] \|$e df" }
1306							*= ref($full[-1])? $full[-1]->dim(1)
1307							: $err_ref->[$err_ref->[-1]]->dim(1)
1308							;
1309							$ret{ "\| $idv->[$i] \|$e ms" }
1310	14					294	= $ret{ "\| $idv->[$i] \|$e ss" } / $ret{ "\| $idv->[$i] \|$e df" };
1311							}
1312							elsif (ref $full[$k]) { # unique error term
1313	58					138	my (@G, $G, $b_G);
1314	58	100				186	@G = grep { $_ != $k and defined $full[$_] } (0 .. $#full);
	942					2231
1315
1316							next EFFECT
1317	58	50				183	unless @G;
1318
1319	58					175	$G = PDLA->null->glue( 1, grep { ref $_ } @full[@G] );
	760					1692
1320	58					11469	$G = $G->glue(1, ones $G->dim(0));
1321	58					7626	$b_G = $y->ols_t( $G, {CONST=>0} );
1322
1323	58	100				300	if ($k == $#full) {
1324							$ret{ss_subject}
1325	4					22	= $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
1326							}
1327							else {
1328							$ret{ "\| $idv->[$i] \|$e ss" }
1329	54					339	= $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
1330	54					3323	$ret{ "\| $idv->[$i] \|$e df" }
1331							= $full[$k]->dim(1);
1332							$ret{ "\| $idv->[$i] \|$e ms" }
1333	54					1235	= $ret{ "\| $idv->[$i] \|$e ss" } / $ret{ "\| $idv->[$i] \|$e df" };
1334							}
1335							}
1336							else { # repeating error term
1337	12	100				24	if ($k == $#full) {
1338	4					21	$ret{ss_subject} = $ret{"\| $idv->[$full[$k]] \|$e ss"};
1339							}
1340							else {
1341	8					37	$ret{ "\| $idv->[$i] \|$e ss" } = $ret{"\| $idv->[$full[$k]] \|$e ss"};
1342	8					27	$ret{ "\| $idv->[$i] \|$e df" } = $ret{"\| $idv->[$full[$k]] \|$e df"};
1343							$ret{ "\| $idv->[$i] \|$e ms" }
1344	8					98	= $ret{ "\| $idv->[$i] \|$e ss" } / $ret{ "\| $idv->[$i] \|$e df" };
1345							}
1346							}
1347							}
1348							# have all iv, inter, and error effects. get F and F_p
1349	8					219	for (0 .. $#$ivs_ref) {
1350							$ret{ "\| $idv->[$_] \| F" }
1351	38					304	= $ret{ "\| $idv->[$_] \| ms" } / $ret{ "\| $idv->[$_] \|\| err ms" };
1352							$ret{ "\| $idv->[$_] \| F_p" }
1353							= 1 - $ret{ "\| $idv->[$_] \| F" }->gsl_cdf_fdist_P(
1354	38	50				87	$ret{ "\| $idv->[$_] \| df" }, $ret{ "\| $idv->[$_] \|\| err df" } )
1355							if $CDF;
1356							}
1357
1358	8	100				62	for (keys %ret) {ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze};
	290					5417
1359
1360	8					216	my $cm_ref
1361							= _cell_means( $y, $ivs_cm_ref, $i_cmo_ref, $idv, \@pdl_ivs_raw );
1362	8					20	my @ls = map { $_->uniq->nelem } @pdl_ivs_raw;
	18					2001
1363							$cm_ref
1364	8					1333	= _fix_rptd_se( $cm_ref, \%ret, $opt{'IVNM'}, \@ls, $sj->uniq->nelem );
1365
1366							# integrate mean and se into %ret
1367							# sort bc we can't count on perl % internal key order implementation
1368	8					132	@ret{ sort keys %$cm_ref } = @$cm_ref{ sort keys %$cm_ref };
1369
1370	8					25	my $highest = join(' ~ ', @{ $opt{IVNM} });
	8					37
1371							$cm_ref->{"# $highest # m"}->plot_means( $cm_ref->{"# $highest # se"},
1372							{ %opt, IVNM=>$idv } )
1373	8	50				30	if $opt{PLOT};
1374
1375	8					638	return %ret;
1376							}
1377
1378							sub _add_errors {
1379	8			8		26	my ($subj, $ivs_ref, $idv, $raw_ivs, $opt) = @_;
1380
1381							# code (btwn group) subjects. Rutherford (2001) pp 101-102
1382
1383	8					15	my (@grp, %grp_s);
1384	8					42	for my $n (0 .. $subj->nelem - 1) {
1385							# construct string to code group membership
1386							# something not treated as BAD by _array_to_pdl to start off marking group membership
1387							# if no $opt->{BTWN}, everyone ends up in the same grp
1388	219					292	my $s = '_';
1389							$s .= $_->($n)
1390	219					229	for (@$raw_ivs[@{ $opt->{BTWN} }]);
	219					466
1391	219					9171	push @grp, $s; # group membership
1392	219					398	$s .= $subj($n); # keep track of total uniq subj
1393	219					11723	$grp_s{$s} = 1;
1394							}
1395	8					42	my $grp = PDLA::Stats::Kmeans::iv_cluster \@grp;
1396
1397	8					56	my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1);
1398	8					413	my ($d0, $d1) = (0, 0);
1399	8					40	for my $g (0 .. $grp->dim(1)-1) {
1400	14					42	my $gsub = $subj( which $grp( ,$g) )->effect_code;
1401	14					103	my ($nobs, $nsub) = $gsub->dims;
1402	14					320	($_tmp = $spdl($d0:$d0+$nobs-1, $d1:$d1+$nsub-1)) .= $gsub;
1403	14					437	$d0 += $nobs;
1404	14					45	$d1 += $nsub;
1405							}
1406
1407							# if btwn factor involved, or highest order inter for within factors
1408							# elem is undef, so that
1409							# @errors ind matches @$ivs_ref, with an extra elem at the end for subj
1410
1411							# mark btwn factors for error terms
1412							# same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p98)
1413	8					16	my @qr = map { "(?:$idv->[$_])" } @{ $opt->{BTWN} };
	5					20
	8					22
1414	8					29	my $qr = join('\|', @qr);
1415
1416	8					13	my $ie_subj;
1417							my @errors = map
1418	8					27	{ my @fs = split ' ~ ', $idv->[$_];
	38					1460
1419							# separate bw and wn factors
1420							# if only bw, error is bw x subj
1421							# if only wn or wn and bw, error is wn x subj
1422	38					60	my (@wn, @bw);
1423	38	100				59	if ($qr) {
1424	24					37	for (@fs) {
1425	44	100				246	/$qr/? push @bw, $_ : push @wn, $_;
1426							}
1427							}
1428							else {
1429	14					30	@wn = @fs;
1430							}
1431	38	100				76	$ie_subj = defined($ie_subj)? $ie_subj : $_
		100
1432							if !@wn;
1433
1434	38	100				120	my $err = @wn? join(' ~ ', @wn) : join(' ~ ', @bw);
1435	38					49	my $ie; # mark repeating error term
1436	38					69	for my $i (0 .. $#$ivs_ref) {
1437	129	100				205	if ($idv->[$i] eq $err) {
1438	38					48	$ie = $i;
1439	38					47	last;
1440							}
1441							}
1442
1443							# highest order inter of within factors, use ss_residual as error
1444	38	100	100			53	if ( @wn == @$raw_ivs - @{$opt->{BTWN}} ) { undef }
	38	100				124
	14	100				41
1445							# repeating btwn factors use ss_subject as error
1446	2					5	elsif (!@wn and $_ > $ie_subj) { $ie_subj }
1447							# repeating error term
1448	6					12	elsif ($_ > $ie) { $ie }
1449	16					50	else { PDLA::clump($ivs_ref->[$_] * $spdl->dummy(1), 1,2) }
1450							} 0 .. $#$ivs_ref;
1451
1452	8	100				15	@{$opt->{BTWN}}? push @errors, $ie_subj : push @errors, $spdl;
	8					26
1453
1454	8					60	return \@errors;
1455							}
1456
1457							sub _fix_rptd_se {
1458							# if ivnm lvls_ref for within ss only this can work for mixed design
1459	8			8		1281	my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_;
1460
1461	8					107	my @se = grep /se$/, keys %$cm_ref;
1462	8					24	@se = map { /^# (.+?) # se$/; $1; } @se;
	38					109
	38					91
1463
1464							my @n_obs
1465							= map {
1466	8					26	my @ivs = split / ~ /, $_;
	38					121
1467	38					123	my $i_ivs = which_id $ivnm, \@ivs;
1468	38					1092	my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs;
1469
1470	38	100				23029	my $collapsed = $icollapsed->nelem?
1471							pdl( @$lvls_ref[(list $icollapsed)] )->prodover
1472							: 1
1473							;
1474	38					2040	$n * $collapsed;
1475							} @se;
1476
1477	8					41	for my $i (0 .. $#se) {
1478							($_tmp = $cm_ref->{"# $se[$i] # se"})
1479	38					946	.= sqrt( $ret->{"\| $se[$i] \|\| err ms"} / $n_obs[$i] );
1480							}
1481
1482	8					184	return $cm_ref;
1483							}
1484
1485							=head2 dummy_code
1486
1487							=for ref
1488
1489							Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression.
1490
1491							Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1492
1493							=for usage
1494
1495							perldl> @a = qw(a a a b b b c c c)
1496							perldl> p $a = dummy_code(\@a)
1497							[
1498							[1 1 1 0 0 0 0 0 0]
1499							[0 0 0 1 1 1 0 0 0]
1500							]
1501
1502							=cut
1503
1504							*dummy_code = \&PDLA::dummy_code;
1505							sub PDLA::dummy_code {
1506	0			0	0	0	my ($var_ref) = @_;
1507
1508	0					0	my $var_e = effect_code( $var_ref );
1509
1510	0					0	($_tmp = $var_e->where( $var_e == -1 )) .= 0;
1511
1512	0					0	return $var_e;
1513							}
1514
1515							=head2 effect_code
1516
1517							=for ref
1518
1519							Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for use in regression. returns in @ context coded pdl and % ref to level - pdl->dim(1) index.
1520
1521							Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1522
1523							=for usage
1524
1525							my @var = qw( a a a b b b c c c );
1526							my ($var_e, $map) = effect_code( \@var );
1527
1528							print $var_e . $var_e->info . "\n";
1529
1530							[
1531							[ 1 1 1 0 0 0 -1 -1 -1]
1532							[ 0 0 0 1 1 1 -1 -1 -1]
1533							]
1534							PDLA: Double D [9,2]
1535
1536							print "$_\t$map->{$_}\n" for (sort keys %$map)
1537							a 0
1538							b 1
1539							c 2
1540
1541							=cut
1542
1543							*effect_code = \&PDLA::effect_code;
1544							sub PDLA::effect_code {
1545	80			80	0	11556	my ($var_ref) = @_;
1546
1547							# pdl->uniq sorts elems. so instead list it to maintain old order
1548	80	100				207	if (ref $var_ref eq 'PDLA') {
1549	44					95	$var_ref = $var_ref->squeeze;
1550	44	50				1523	$var_ref->getndims > 1 and
1551							croak "multidim pdl passed for single var!";
1552	44					105	$var_ref = [ list $var_ref ];
1553							}
1554
1555	80					1195	my ($var, $map_ref) = PDLA::Stats::Basic::_array_to_pdl( $var_ref );
1556	80					224	my $var_e = zeroes float, $var->nelem, $var->max;
1557
1558	80					7463	for my $l (0 .. $var->max - 1) {
1559	347					25695	my $v = $var_e( ,$l);
1560	347					6364	($_tmp = $v->index( which $var == $l )) .= 1;
1561	347					17319	($_tmp = $v->index( which $var == $var->max )) .= -1;
1562							}
1563
1564	80	100				6946	if ($var->badflag) {
1565	1					10	my $ibad = which $var->isbad;
1566	1					27	($_tmp = $var_e($ibad, )) .= -99;
1567	1					57	$var_e = $var_e->setvaltobad(-99);
1568							}
1569
1570	80	100				423	return wantarray? ($var_e, $map_ref) : $var_e;
1571							}
1572
1573							=head2 effect_code_w
1574
1575							=for ref
1576
1577							Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index.
1578
1579							Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1580
1581							=for usage
1582
1583							perldl> @a = qw( a a b b b c c )
1584							perldl> p $a = effect_code_w(\@a)
1585							[
1586							[ 1 1 0 0 0 -1 -1]
1587							[ 0 0 1 1 1 -1.5 -1.5]
1588							]
1589
1590							=cut
1591
1592							*effect_code_w = \&PDLA::effect_code_w;
1593							sub PDLA::effect_code_w {
1594	1			1	0	1113	my ($var_ref) = @_;
1595
1596	1					3	my ($var_e, $map_ref) = effect_code( $var_ref );
1597
1598	1	50				4	if ($var_e->sum == 0) {
1599	0	0				0	return wantarray? ($var_e, $map_ref) : $var_e;
1600							}
1601
1602	1					41	for (0..$var_e->dim(1)-1) {
1603	2					61	my $factor = $var_e( ,$_);
1604	2					41	my $pos = which $factor == 1;
1605	2					85	my $neg = which $factor == -1;
1606	2					104	my $w = $pos->nelem / $neg->nelem;
1607	2					7	($_tmp = $factor($neg)) *= $w;
1608							}
1609
1610	1	50				56	return wantarray? ($var_e, $map_ref) : $var_e;
1611							}
1612
1613							=head2 interaction_code
1614
1615							Returns the coded interaction term for effect-coded variables.
1616
1617							Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1618
1619							=for usage
1620
1621							perldl> $a = sequence(6) > 2
1622							perldl> p $a = $a->effect_code
1623							[
1624							[ 1 1 1 -1 -1 -1]
1625							]
1626
1627							perldl> $b = pdl( qw( 0 1 2 0 1 2 ) )
1628							perldl> p $b = $b->effect_code
1629							[
1630							[ 1 0 -1 1 0 -1]
1631							[ 0 1 -1 0 1 -1]
1632							]
1633
1634							perldl> p $ab = interaction_code( $a, $b )
1635							[
1636							[ 1 0 -1 -1 -0 1]
1637							[ 0 1 -1 -0 -1 1]
1638							]
1639
1640							=cut
1641
1642							*interaction_code = \&PDLA::interaction_code;
1643							sub PDLA::interaction_code {
1644	10			10	0	61	my @vars = @_;
1645
1646	10					52	my $i = ones( $vars[0]->dim(0), 1 );
1647	10					575	for (@vars) {
1648	21					582	$i = $i * $_->dummy(1);
1649	21					1193	$i = $i->clump(1,2);
1650							}
1651
1652	10					429	return $i;
1653							}
1654
1655							=head2 ols
1656
1657							=for ref
1658
1659							Ordinary least squares regression, aka linear regression. Unlike B, ols is not threadable, but it can handle bad value (by ignoring observations with bad value in dependent or independent variables list-wise) and returns the full model in list context with various stats.
1660
1661							IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. Intercept is automatically added and returned as LAST of the coeffs if CONST=>1. Returns full model in list context and coeff in scalar context.
1662
1663							=for options
1664
1665							Default options (case insensitive):
1666
1667							CONST => 1,
1668							PLOT => 1, # see plot_residuals() for plot options
1669
1670							=for usage
1671
1672							Usage:
1673
1674							# suppose this is a person's ratings for top 10 box office movies
1675							# ascending sorted by box office
1676
1677							perldl> p $y = qsort ceil( random(10) * 5 )
1678							[1 1 2 2 2 2 4 4 5 5]
1679
1680							# construct IV with linear and quadratic component
1681
1682							perldl> p $x = cat sequence(10), sequence(10)**2
1683							[
1684							[ 0 1 2 3 4 5 6 7 8 9]
1685							[ 0 1 4 9 16 25 36 49 64 81]
1686							]
1687
1688							perldl> %m = $y->ols( $x )
1689
1690							perldl> p "$_\t$m{$_}\n" for (sort keys %m)
1691
1692							F 40.4225352112676
1693							F_df [2 7]
1694							F_p 0.000142834216344756
1695							R2 0.920314253647587
1696
1697							# coeff linear quadratic constant
1698
1699							b [0.21212121 0.03030303 0.98181818]
1700							b_p [0.32800118 0.20303404 0.039910509]
1701							b_se [0.20174693 0.021579989 0.38987581]
1702							b_t [ 1.0514223 1.404219 2.5182844]
1703							ss_model 19.8787878787879
1704							ss_residual 1.72121212121212
1705							ss_total 21.6
1706							y_pred [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]
1707
1708							=cut
1709
1710							*ols = \&PDLA::ols;
1711							sub PDLA::ols {
1712							# y [n], ivs [n x attr] pdl
1713	4			4	0	1834	my ($y, $ivs, $opt) = @_;
1714	4					9	my %opt = (
1715							CONST => 1,
1716							PLOT => 1,
1717							);
1718	4		33			22	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
1719
1720	4					12	$y = $y->squeeze;
1721	4	50				152	$y->getndims > 1 and
1722							croak "use ols_t for threaded version";
1723
1724	4	50				17	$ivs = $ivs->dummy(1) if $ivs->getndims == 1;
1725
1726	4					115	($y, $ivs) = _rm_bad_value( $y, $ivs );
1727
1728							# set up ivs and const as ivs
1729							$opt{CONST} and
1730	4	100				17	$ivs = $ivs->glue( 1, ones($ivs->dim(0)) );
1731
1732							# Internally normalise data
1733
1734	4					230	my $ymean = (abs($y)->sum)/($y->nelem);
1735	4	50				197	$ymean = 1 if $ymean == 0;
1736	4					34	my $y2 = $y / $ymean;
1737
1738							# Do the fit
1739
1740	4					15	my $Y = $ivs x $y2->dummy(0);
1741
1742	4					246	my $C;
1743	4	50				9	if ( $SLATEC ) {
1744	0					0	$C = PDLA::Slatec::matinv( $ivs x $ivs->xchg(0,1) );
1745							}
1746							else {
1747	4					17	$C = inv( $ivs x $ivs->xchg(0,1) );
1748							}
1749
1750							# Fitted coefficients vector
1751	4					5330	my $coeff = PDLA::squeeze( $C x $Y );
1752	4					321	$coeff *= $ymean; # Un-normalise
1753
1754	4					50	my %ret;
1755
1756							# *$coeff x $ivs looks nice but produces nan on successive tries*
1757	4					13	$ret{y_pred} = sumover( $coeff * $ivs->transpose );
1758
1759	4	50				91	$opt{PLOT} and $y->plot_residuals( $ret{y_pred}, \%opt );
1760
1761	4	100				23	return $coeff
1762							unless wantarray;
1763
1764	2					3	$ret{b} = $coeff;
1765	2	50				20	$ret{ss_total} = $opt{CONST}? $y->ss : sum( $y ** 2 );
1766	2					18	$ret{ss_residual} = $y->sse( $ret{y_pred} );
1767	2					14	$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
1768	2					13	$ret{R2} = $ret{ss_model} / $ret{ss_total};
1769
1770	2	50				10	my $n_var = $opt{CONST}? $ivs->dim(1) - 1 : $ivs->dim(1);
1771	2					9	$ret{F_df} = pdl( $n_var, $y->nelem - $ivs->dim(1) );
1772							$ret{F} = $ret{ss_model} / $ret{F_df}->(0)
1773	2					43	/ ( $ret{ss_residual} / $ret{F_df}->(1) );
1774							$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog )
1775	2	50				72	if $CDF;
1776
1777	2	50				6	my $se_b = ones( $coeff->dims? $coeff->dims : 1 );
1778
1779							$opt{CONST} and
1780	2	50				154	($_tmp = $se_b(-1)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) * $C(-1,-1) );
1781
1782							# get the se for bs by successivly regressing each iv by the rest ivs
1783	2	50				132	if ($ivs->dim(1) > 1) {
1784	2					6	for my $k (0 .. $n_var-1) {
1785	2					4	my @G = grep { $_ != $k } (0 .. $n_var-1);
	2					6
1786	2					8	my $G = $ivs->dice_axis(1, \@G);
1787							$opt{CONST} and
1788	2	50				75	$G = $G->glue( 1, ones($ivs->dim(0)) );
1789	2					118	my $b_G = $ivs( ,$k)->ols( $G, {CONST=>0,PLOT=>0} );
1790
1791	2					10	my $ss_res_k = $ivs( ,$k)->squeeze->sse( sumover($b_G * $G->transpose) );
1792
1793	2					282	($_tmp = $se_b($k)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / $ss_res_k );
1794							}
1795							}
1796							else {
1797							($_tmp = $se_b(0))
1798	0					0	.= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / sum( $ivs( ,0)**2 ) );
1799							}
1800
1801	2					107	$ret{b_se} = $se_b;
1802	2					14	$ret{b_t} = $ret{b} / $ret{b_se};
1803	2	50				6	$ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $ret{F_df}->(1) ) )
1804							if $CDF;
1805
1806	2	50				7	for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze };
	20					503
1807
1808	2					76	return %ret;
1809							}
1810
1811							sub _rm_bad_value {
1812	8			8		15	my ($y, $ivs) = @_;
1813
1814	8					11	my $idx;
1815	8	100	66			22	if ($y->check_badflag or $ivs->check_badflag) {
1816	3					186	$idx = which(($y->isbad==0) & (nbadover ($ivs->transpose)==0));
1817	3					192	$y = $y($idx)->sever;
1818	3					125	$ivs = $ivs($idx,)->sever;
1819	3					116	$ivs->badflag(0);
1820	3					7	$y->badflag(0);
1821							}
1822
1823	8					81	return $y, $ivs, $idx;
1824							}
1825
1826							=head2 ols_rptd
1827
1828							=for ref
1829
1830							Repeated measures linear regression (Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006). Handles purely within-subject design for now. See t/stats_ols_rptd.t for an example using the Lorch and Myers data.
1831
1832							=for usage
1833
1834							Usage:
1835
1836							# This is the example from Lorch and Myers (1990),
1837							# a study on how characteristics of sentences affected reading time
1838							# Three within-subject IVs:
1839							# SP -- serial position of sentence
1840							# WORDS -- number of words in sentence
1841							# NEW -- number of new arguments in sentence
1842
1843							# $subj can be 1D pdl or @ ref and must be the first argument
1844							# IV can be 1D @ ref or pdl
1845							# 1D @ ref is effect coded internally into pdl
1846							# pdl is left as is
1847
1848							my %r = $rt->ols_rptd( $subj, $sp, $words, $new );
1849
1850							print "$_\t$r{$_}\n" for (sort keys %r);
1851
1852							(ss_residual) 58.3754646504336
1853							(ss_subject) 51.8590337714286
1854							(ss_total) 405.188241771429
1855							# SP WORDS NEW
1856							F [ 7.208473 61.354153 1.0243311]
1857							F_p [0.025006181 2.619081e-05 0.33792837]
1858							coeff [0.33337285 0.45858933 0.15162986]
1859							df [1 1 1]
1860							df_err [9 9 9]
1861							ms [ 18.450705 73.813294 0.57026483]
1862							ms_err [ 2.5595857 1.2030692 0.55671923]
1863							ss [ 18.450705 73.813294 0.57026483]
1864							ss_err [ 23.036272 10.827623 5.0104731]
1865
1866
1867							=cut
1868
1869							*ols_rptd = \&PDLA::ols_rptd;
1870							sub PDLA::ols_rptd {
1871	1			1	0	68	my ($y, $subj, @ivs_raw) = @_;
1872
1873	1					6	$y = $y->squeeze;
1874	1	50				63	$y->getndims > 1 and
1875							croak "ols_rptd does not support threading";
1876
1877	1	50	66			4	my @ivs = map { (ref $_ eq 'PDLA' and $_->ndims > 1)? $_
	3	100				59
1878							: ref $_ eq 'PDLA' ? $_->dummy(1)
1879							: scalar effect_code($_)
1880							;
1881							} @ivs_raw;
1882
1883	1					3	my %r;
1884
1885	1					71	$r{'(ss_total)'} = $y->ss;
1886
1887							# STEP 1: subj
1888
1889	1					9	my $s = effect_code $subj; # gives same results as dummy_code
1890	1					8	my $b_s = $y->ols_t($s);
1891	1					9	my $pred = sumover($b_s(0:-2) * $s->transpose) + $b_s(-1);
1892	1					161	$r{'(ss_subject)'} = $r{'(ss_total)'} - $y->sse( $pred );
1893
1894							# STEP 2: add predictor variables
1895
1896	1					12	my $iv_p = $s->glue(1, @ivs);
1897	1					108	my $b_p = $y->ols_t($iv_p);
1898
1899							# only care about coeff for predictor vars. no subj or const coeff
1900	1					8	$r{coeff} = $b_p(-(@ivs+1) : -2)->sever;
1901
1902							# get total sse for this step
1903	1					20	$pred = sumover($b_p(0:-2) * $iv_p->transpose) + $b_p(-1);
1904	1					85	my $ss_pe = $y->sse( $pred );
1905
1906							# get predictor ss by successively reducing the model
1907	1					7	$r{ss} = zeroes scalar(@ivs);
1908	1					67	for my $i (0 .. $#ivs) {
1909	3					135	my @i_rest = grep { $_ != $i } 0 .. $#ivs;
	9					21
1910	3					24	my $iv = $s->glue(1, @ivs[ @i_rest ]);
1911	3					292	my $b = $y->ols_t($iv);
1912	3					16	$pred = sumover($b(0:-2) * $iv->transpose) + $b(-1);
1913	3					204	($_tmp = $r{ss}->($i)) .= $y->sse($pred) - $ss_pe;
1914							}
1915
1916							# STEP 3: get precitor x subj interaction as error term
1917
1918	1					67	my $iv_e = PDLA::glue 1, map { interaction_code( $s, $_ ) } @ivs;
	3					10
1919
1920							# get total sse for this step. full model now.
1921	1					180	my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
1922	1					12	$pred = sumover($b_f(0:-2) * $iv_p->glue(1,$iv_e)->transpose) + $b_f(-1);
1923	1					189	$r{'(ss_residual)'} = $y->sse( $pred );
1924
1925							# get predictor x subj ss by successively reducing the error term
1926	1					7	$r{ss_err} = zeroes scalar(@ivs);
1927	1					83	for my $i (0 .. $#ivs) {
1928	3					141	my @i_rest = grep { $_ != $i } 0 .. $#ivs;
	9					21
1929	3					11	my $e_rest = PDLA::glue 1, map { interaction_code( $s, $_ ) } @ivs[@i_rest];
	6					20
1930	3					253	my $iv = $iv_p->glue(1, $e_rest);
1931	3					126	my $b = $y->ols_t($iv);
1932	3					14	my $pred = sumover($b(0:-2) * $iv->transpose) + $b(-1);
1933	3					221	($_tmp = $r{ss_err}->($i)) .= $y->sse($pred) - $r{'(ss_residual)'};
1934							}
1935
1936							# Finally, get MS, F, etc
1937
1938	1					66	$r{df} = pdl( map { $_->squeeze->ndims } @ivs );
	3					86
1939	1					67	$r{ms} = $r{ss} / $r{df};
1940
1941	1					14	$r{df_err} = $s->dim(1) * $r{df};
1942	1					11	$r{ms_err} = $r{ss_err} / $r{df_err};
1943
1944	1					8	$r{F} = $r{ms} / $r{ms_err};
1945
1946							$r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} )
1947	1	50				6	if $CDF;
1948
1949	1					73	return %r;
1950							}
1951
1952
1953							=head2 logistic
1954
1955							=for ref
1956
1957							Logistic regression with maximum likelihood estimation using PDLA::Fit::LM (requires PDLA::Slatec. Hence loaded with "require" in the sub instead of "use" at the beginning).
1958
1959							IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. It is included in the model and returned as LAST of coeff. Returns full model in list context and coeff in scalar context.
1960
1961							The significance tests are likelihood ratio tests (-2LL deviance) tests. IV significance is tested by comparing deviances between the reduced model (ie with the IV in question removed) and the full model.
1962
1963							*NOTE: the results here are qualitatively similar to but not identical with results from R, because different algorithms are used for the nonlinear parameter fit. Use with discretion*
1964
1965							=for options
1966
1967							Default options (case insensitive):
1968
1969							INITP => zeroes( $x->dim(1) + 1 ), # n_iv + 1
1970							MAXIT => 1000,
1971							EPS => 1e-7,
1972
1973							=for usage
1974
1975							Usage:
1976
1977							# suppose this is whether a person had rented 10 movies
1978
1979							perldl> p $y = ushort( random(10)*2 )
1980							[0 0 0 1 1 0 0 1 1 1]
1981
1982							# IV 1 is box office ranking
1983
1984							perldl> p $x1 = sequence(10)
1985							[0 1 2 3 4 5 6 7 8 9]
1986
1987							# IV 2 is whether the movie is action- or chick-flick
1988
1989							perldl> p $x2 = sequence(10) % 2
1990							[0 1 0 1 0 1 0 1 0 1]
1991
1992							# concatenate the IVs together
1993
1994							perldl> p $x = cat $x1, $x2
1995							[
1996							[0 1 2 3 4 5 6 7 8 9]
1997							[0 1 0 1 0 1 0 1 0 1]
1998							]
1999
2000							perldl> %m = $y->logistic( $x )
2001
2002							perldl> p "$_\t$m{$_}\n" for (sort keys %m)
2003
2004							D0 13.8629436111989
2005							Dm 9.8627829791575
2006							Dm_chisq 4.00016063204141
2007							Dm_df 2
2008							Dm_p 0.135324414081692
2009							# ranking genre constant
2010							b [0.41127706 0.53876358 -2.1201285]
2011							b_chisq [ 3.5974504 0.16835559 2.8577151]
2012							b_p [0.057868258 0.6815774 0.090936587]
2013							iter 12
2014							y_pred [0.10715577 0.23683909 ... 0.76316091 0.89284423]
2015
2016
2017							=cut
2018
2019							*logistic = \&PDLA::logistic;
2020							sub PDLA::logistic {
2021	0			0	0	0	require PDLA::Fit::LM; # uses PDLA::Slatec
2022
2023	0					0	my ( $self, $ivs, $opt ) = @_;
2024
2025	0					0	$self = $self->squeeze;
2026							# make compatible w multiple var cases
2027	0	0				0	$ivs->getndims == 1 and $ivs = $ivs->dummy(1);
2028	0	0				0	$self->dim(0) != $ivs->dim(0) and
2029							carp "mismatched n btwn DV and IV!";
2030
2031	0					0	my %opt = (
2032							INITP => zeroes( $ivs->dim(1) + 1 ), # n_ivs + 1
2033							MAXIT => 1000,
2034							EPS => 1e-7,
2035							);
2036	0		0			0	$opt and $opt{uc $_} = $opt->{$_} for (%$opt);
2037							# not using it atm
2038	0					0	$opt{WT} = 1;
2039
2040							# Use lmfit. Fourth input argument is reference to user-defined
2041							# copy INITP so we have the original value when needed
2042							my ($yfit,$coeff,$cov,$iter)
2043							= PDLA::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy,
2044	0					0	{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
2045							# apparently at least coeff is child of some pdl
2046							# which is changed in later lmfit calls
2047	0					0	$yfit = $yfit->copy;
2048	0					0	$coeff = $coeff->copy;
2049
2050	0	0				0	return $coeff unless wantarray;
2051
2052	0					0	my %ret;
2053
2054	0					0	my $n0 = $self->where($self == 0)->nelem;
2055	0					0	my $n1 = $self->nelem - $n0;
2056
2057	0					0	$ret{D0} = -2($n0 log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
2058	0					0	$ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
2059	0					0	$ret{Dm_chisq} = $ret{D0} - $ret{Dm};
2060	0					0	$ret{Dm_df} = $ivs->dim(1);
2061							$ret{Dm_p}
2062							= 1 - PDLA::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
2063	0	0				0	if $CDF;
2064
2065	0					0	my $coeff_chisq = zeroes $opt{INITP}->nelem;
2066
2067	0	0				0	if ( $ivs->dim(1) > 1 ) {
2068	0					0	for my $k (0 .. $ivs->dim(1)-1) {
2069	0					0	my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
	0					0
2070	0					0	my $G = $ivs->dice_axis(1, \@G);
2071
2072	0					0	my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
2073							my $y_G
2074							= PDLA::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
2075	0					0	{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
2076
2077	0					0	($_tmp = $coeff_chisq($k)) .= $self->dm( $y_G ) - $ret{Dm};
2078							}
2079							}
2080							else {
2081							# d0 is, by definition, the deviance with only intercept
2082	0					0	($_tmp = $coeff_chisq(0)) .= $ret{D0} - $ret{Dm};
2083							}
2084
2085							my $y_c
2086							= PDLA::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->(0:-2)->copy,
2087	0					0	{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
2088
2089	0					0	($_tmp = $coeff_chisq(-1)) .= $self->dm( $y_c ) - $ret{Dm};
2090
2091	0					0	$ret{b} = $coeff;
2092	0					0	$ret{b_chisq} = $coeff_chisq;
2093	0	0				0	$ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 )
2094							if $CDF;
2095	0					0	$ret{y_pred} = $yfit;
2096	0					0	$ret{iter} = $iter;
2097
2098	0	0				0	for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze };
	0					0
2099
2100	0					0	return %ret;
2101							}
2102
2103							sub _logistic {
2104	0			0		0	my ($x,$par,$ym,$dyda) = @_;
2105
2106							# $b and $c are fit parameters slope and intercept
2107	0					0	my $b = $par(0 : $x->dim(1) - 1)->sever;
2108	0					0	my $c = $par(-1)->sever;
2109
2110							# Write function with dependent variable $ym,
2111							# independent variable $x, and fit parameters as specified above.
2112							# Use the .= (dot equals) assignment operator to express the equality
2113							# (not just a plain equals)
2114	0					0	($_tmp = $ym) .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );
2115
2116	0					0	my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
	0					0
2117
2118							# Partial derivative of the function with respect to each slope
2119							# fit parameter ($b in this case). Again, note .= assignment
2120							# operator (not just "equals")
2121							($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym)
2122	0					0	for (0 .. $b->dim(0)-1);
2123
2124							# Partial derivative of the function re intercept par
2125	0					0	($_tmp = $dy[-1]) .= $ym * (1 - $ym);
2126							}
2127
2128							sub _logistic_no_intercept {
2129	0			0		0	my ($x,$par,$ym,$dyda) = @_;
2130
2131	0					0	my $b = $par(0 : $x->dim(1) - 1)->sever;
2132
2133							# Write function with dependent variable $ym,
2134							# independent variable $x, and fit parameters as specified above.
2135							# Use the .= (dot equals) assignment operator to express the equality
2136							# (not just a plain equals)
2137	0					0	($_tmp = $ym) .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );
2138
2139	0					0	my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
	0					0
2140
2141							# Partial derivative of the function with respect to each slope
2142							# fit parameter ($b in this case). Again, note .= assignment
2143							# operator (not just "equals")
2144							($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym)
2145	0					0	for (0 .. $b->dim(0)-1);
2146							}
2147
2148							=head2 pca
2149
2150							=for ref
2151
2152							Principal component analysis. Based on corr instead of cov (bad values are ignored pair-wise. OK when bad values are few but otherwise probably should fill_m etc before pca). Use PDLA::Slatec::eigsys() if installed, otherwise use PDLA::MatrixOps::eigens_sym().
2153
2154							=for options
2155
2156							Default options (case insensitive):
2157
2158							CORR => 1, # boolean. use correlation or covariance
2159							PLOT => 1, # calls plot_screes by default
2160							# can set plot_screes options here
2161
2162							=for usage
2163
2164							Usage:
2165
2166							my $d = qsort random 10, 5; # 10 obs on 5 variables
2167							my %r = $d->pca( \%opt );
2168							print "$_\t$r{$_}\n" for (keys %r);
2169
2170							eigenvalue # variance accounted for by each component
2171							[4.70192 0.199604 0.0471421 0.0372981 0.0140346]
2172
2173							eigenvector # dim var x comp. weights for mapping variables to component
2174							[
2175							[ -0.451251 -0.440696 -0.457628 -0.451491 -0.434618]
2176							[ -0.274551 0.582455 0.131494 0.255261 -0.709168]
2177							[ 0.43282 0.500662 -0.139209 -0.735144 -0.0467834]
2178							[ 0.693634 -0.428171 0.125114 0.128145 -0.550879]
2179							[ 0.229202 0.180393 -0.859217 0.4173 0.0503155]
2180							]
2181
2182							loadings # dim var x comp. correlation between variable and component
2183							[
2184							[ -0.978489 -0.955601 -0.992316 -0.97901 -0.942421]
2185							[ -0.122661 0.260224 0.0587476 0.114043 -0.316836]
2186							[ 0.0939749 0.108705 -0.0302253 -0.159616 -0.0101577]
2187							[ 0.13396 -0.0826915 0.0241629 0.0247483 -0.10639]
2188							[ 0.027153 0.0213708 -0.101789 0.0494365 0.00596076]
2189							]
2190
2191							pct_var # percent variance accounted for by each component
2192							[0.940384 0.0399209 0.00942842 0.00745963 0.00280691]
2193
2194							Plot scores along the first two components,
2195
2196							$d->plot_scores( $r{eigenvector} );
2197
2198							=cut
2199
2200							*pca = \&PDLA::pca;
2201							sub PDLA::pca {
2202	3			3	0	5295	my ($self, $opt) = @_;
2203
2204	3					7	my %opt = (
2205							CORR => 1,
2206							PLOT => 1,
2207							);
2208	3		33			18	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
2209
2210	3	100				125	my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;
2211
2212							# value is axis pdl and score is var x axis
2213	3					8	my ($eigval, $eigvec);
2214	3	50				8	if ( $SLATEC ) {
2215	0					0	($eigval, $eigvec) = $var_var->PDLA::Slatec::eigsys;
2216							}
2217							else {
2218	3					12	($eigvec, $eigval) = $var_var->eigens_sym;
2219							# compatibility with PDLA::Slatec::eigsys
2220	3					2415	$eigvec = $eigvec->inplace->transpose->sever;
2221							}
2222
2223							# ind is sticky point for threading
2224	3					130	my $ind_sorted = $eigval->qsorti->(-1:0);
2225	3					49	$eigvec = $eigvec( ,$ind_sorted)->sever;
2226	3					129	$eigval = $eigval($ind_sorted)->sever;
2227
2228							# var x axis
2229	3					103	my $var = $eigval / $eigval->sum;
2230	3					134	my $loadings;
2231	3	100				9	if ($opt{CORR}) {
2232	2					7	$loadings = $eigvec * sqrt( $eigval->transpose );
2233							}
2234							else {
2235	1					13	my $scores = $eigvec x $self->dev_m;
2236	1					72	$loadings = $self->corr( $scores->dummy(1) );
2237							}
2238
2239							$var->plot_screes(\%opt)
2240	3	50				232	if $opt{PLOT};
2241
2242	3					29	return ( eigenvalue=>$eigval, eigenvector=>$eigvec,
2243							pct_var=>$var, loadings=>$loadings );
2244							}
2245
2246							=head2 pca_sorti
2247
2248							Determine by which vars a component is best represented. Descending sort vars by size of association with that component. Returns sorted var and relevant component indices.
2249
2250							=for options
2251
2252							Default options (case insensitive):
2253
2254							NCOMP => 10, # maximum number of components to consider
2255
2256							=for usage
2257
2258							Usage:
2259
2260							# let's see if we replicated the Osgood et al. (1957) study
2261							perldl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}
2262
2263							# select a subset of var to do pca
2264							perldl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
2265							perldl> $data = $data( ,$ind)->sever
2266							perldl> @$idv = @$idv[list $ind]
2267
2268							perldl> %m = $data->pca
2269
2270							perldl> ($iv, $ic) = $m{loadings}->pca_sorti()
2271
2272							perldl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)
2273
2274							# COMP0 COMP1 COMP2 COMP3
2275							HAPPY [0.860191 0.364911 0.174372 -0.10484]
2276							GOOD [0.848694 0.303652 0.198378 -0.115177]
2277							CALM [0.821177 -0.130542 0.396215 -0.125368]
2278							BRIGHT [0.78303 0.232808 -0.0534081 -0.0528796]
2279							HEAVY [-0.623036 0.454826 0.50447 0.073007]
2280							HARD [-0.679179 0.0505568 0.384467 0.165608]
2281							ACTIVE [-0.161098 0.760778 -0.44893 -0.0888592]
2282							FAST [-0.196042 0.71479 -0.471355 0.00460276]
2283							LARGE [-0.241994 0.594644 0.634703 -0.00618055]
2284							BASS [-0.621213 -0.124918 0.0605367 -0.765184]
2285
2286							=cut
2287
2288							*pca_sorti = \&PDLA::pca_sorti;
2289							sub PDLA::pca_sorti {
2290							# $self is pdl (var x component)
2291	1			1	0	7	my ($self, $opt) = @_;
2292
2293	1					4	my %opt = (
2294							NCOMP => 10, # maximum number of components to consider
2295							);
2296	1		0			4	$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
2297
2298	1					6	my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
2299	1					62	$self = $self->dice_axis( 1, pdl(0..$ncomp-1) );
2300
2301	1					47	my $icomp = $self->transpose->abs->maximum_ind;
2302
2303							# sort between comp
2304	1					33	my $ivar_sort = $icomp->qsorti;
2305	1					4	$self = $self($ivar_sort, )->sever;
2306
2307							# sort within comp
2308	1					41	my $ic = $icomp($ivar_sort)->iv_cluster;
2309	1					7	for my $comp (0 .. $ic->dim(1)-1) {
2310	3					369	my $i = $self(which($ic( ,$comp)), ($comp))->qsorti->(-1:0);
2311	3					270	($_tmp = $ivar_sort(which $ic( ,$comp)))
2312							.= $ivar_sort(which $ic( ,$comp))->($i)->sever;
2313							}
2314	1	50				168	return wantarray? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
2315							}
2316
2317							=head2 plot_means
2318
2319							Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).
2320
2321							=for options
2322
2323							Default options (case insensitive):
2324
2325							IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
2326							DVNM => 'DV',
2327							AUTO => 1, # auto set dims to be on x-axis, line, panel
2328							# if set 0, dim 0 goes on x-axis, dim 1 as lines
2329							# dim 2+ as panels
2330							# see PDLA::Graphics::PGPLOT::Window for next options
2331							WIN => undef, # pgwin object. not closed here if passed
2332							# allows comparing multiple lines in same plot
2333							# set env before passing WIN
2334							DEV => '/xs', # open and close dev for plotting if no WIN
2335							# defaults to '/png' in Windows
2336							SIZE => 640, # individual square panel size in pixels
2337							SYMBL => [0, 4, 7, 11],
2338
2339							=for usage
2340
2341							Usage:
2342
2343							# see anova for mean / se pdl structure
2344							$mean->plot_means( $se, {IVNM=>['apple', 'bake']} );
2345
2346							Or like this:
2347
2348							$m{'# apple ~ bake # m'}->plot_means;
2349
2350							=cut
2351
2352							*plot_means = \&PDLA::plot_means;
2353							sub PDLA::plot_means {
2354	0	0		0	0		my $opt = pop @_
2355							if ref $_[-1] eq 'HASH';
2356	0						my ($self, $se) = @_;
2357	0	0					if (!$PGPLOT) {
2358	0						carp "No PDLA::Graphics::PGPLOT, no plot :(";
2359	0						return;
2360							}
2361	0						$self = $self->squeeze;
2362	0	0					if ($self->ndims > 4) {
2363	0						carp "Data is > 4D. No plot here.";
2364	0						return;
2365							}
2366
2367	0						my %opt = (
2368							IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
2369							DVNM => 'DV',
2370							AUTO => 1, # auto set vars to be on X axis, line, panel
2371							WIN => undef, # PDLA::Graphics::PGPLOT::Window object
2372							DEV => $DEV,
2373							SIZE => 640, # individual square panel size in pixels
2374							SYMBL => [0, 4, 7, 11], # ref PDLA::Graphics::PGPLOT::Window
2375							);
2376	0		0				$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
2377
2378							# decide which vars to plot as x axis, lines, panels
2379							# put var w most levels on x axis
2380							# put var w least levels on diff panels
2381	0						my @iD = 0..3;
2382	0						my @dims = (1, 1, 1, 1);
2383							# splice ARRAY,OFFSET,LENGTH,LIST
2384	0						splice @dims, 0, $self->ndims, $self->dims;
2385	0						$self = $self->reshape(@dims)->sever;
2386	0	0					$se = $se->reshape(@dims)->sever
2387							if defined $se;
2388	0						@iD = reverse sort { $a<=>$b } @dims
2389	0	0					if $opt{AUTO};
2390
2391							# $iD[0] on x axis
2392							# $iD[1] as separate lines
2393	0						my $nx = $self->dim($iD[2]); # n xpanels
2394	0						my $ny = $self->dim($iD[3]); # n ypanels
2395
2396	0						my $w = $opt{WIN};
2397	0	0					if (!defined $w) {
2398							$w = pgwin(DEV=>$opt{DEV}, NX=>$nx, NY=>$ny,
2399	0						SIZE=>[$opt{SIZE}$nx, $opt{SIZE}$ny], UNIT=>3);
2400							}
2401
2402	0	0					my ($min, $max) = defined $se? pdl($self + $se, $self - $se)->minmax
2403							: $self->minmax
2404							;
2405	0						my $range = $max - $min;
2406	0						my $p = 0; # panel
2407
2408	0						for my $y (0..$self->dim($iD[3])-1) {
2409	0						for my $x (0..$self->dim($iD[2])-1) {
2410	0						$p ++;
2411	0						my $tl = '';
2412	0	0					$tl = $opt{IVNM}->[$iD[2]] . " $x" if $self->dim($iD[2]) > 1;
2413	0	0					$tl .= ' ' . $opt{IVNM}->[$iD[3]] . " $y" if $self->dim($iD[3]) > 1;
2414							$w->env( 0, $self->dim($iD[0])-1, $min - 2*$range/5, $max + $range/5,
2415							{ XTitle=>$opt{IVNM}->[$iD[0]], YTitle=>$opt{DVNM}, Title=>$tl, PANEL=>$p, AXIS=>['BCNT', 'BCNST'], Border=>1,
2416							} )
2417	0	0					unless $opt{WIN};
2418
2419	0						my (@legend, @color);
2420	0						for (0 .. $self->dim($iD[1]) - 1) {
2421	0	0					push @legend, $opt{IVNM}->[$iD[1]] . " $_"
2422							if ($self->dim($iD[1]) > 1);
2423	0						push @color, $_ + 2; # start from red
2424							$w->points( sequence($self->dim($iD[0])),
2425							$self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_),
2426	0						$opt{SYMBL}->[$_],
2427							{ PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2, PLOTLINE=>1, } );
2428	0	0					$w->errb( sequence($self->dim($iD[0])),
2429							$self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_),
2430							$se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_),
2431							{ PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2 } )
2432							if defined $se;
2433							}
2434	0	0					if ($self->dim($iD[1]) > 1) {
2435	0						$w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10,
2436							{ COLOR=>\@color } );
2437							$w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10,
2438	0						{ COLOR=>\@color, SYMBOL=>[ @{$opt{SYMBL}}[0..$#color] ] } );
	0
2439							}
2440							}
2441							}
2442							$w->close
2443	0	0					unless $opt{WIN};
2444
2445	0						return;
2446							}
2447
2448							=head2 plot_residuals
2449
2450							Plots residuals against predicted values.
2451
2452							=for usage
2453
2454							Usage:
2455
2456							$y->plot_residuals( $y_pred, { dev=>'/png' } );
2457
2458							=for options
2459
2460							Default options (case insensitive):
2461
2462							# see PDLA::Graphics::PGPLOT::Window for more info
2463							WIN => undef, # pgwin object. not closed here if passed
2464							# allows comparing multiple lines in same plot
2465							# set env before passing WIN
2466							DEV => '/xs', # open and close dev for plotting if no WIN
2467							# defaults to '/png' in Windows
2468							SIZE => 640, # plot size in pixels
2469							COLOR => 1,
2470
2471							=cut
2472
2473							*plot_residuals = \&PDLA::plot_residuals;
2474							sub PDLA::plot_residuals {
2475	0	0		0	0		if (!$PGPLOT) {
2476	0						carp "No PDLA::Graphics::PGPLOT, no plot :(";
2477	0						return;
2478							}
2479	0	0					my $opt = pop @_
2480							if ref $_[-1] eq 'HASH';
2481	0						my ($y, $y_pred) = @_;
2482	0						my %opt = (
2483							# see PDLA::Graphics::PGPLOT::Window for next options
2484							WIN => undef, # pgwin object. not closed here if passed
2485							# allows comparing multiple lines in same plot
2486							# set env before passing WIN
2487							DEV => $DEV , # open and close dev for plotting if no WIN
2488							SIZE => 640, # plot size in pixels
2489							COLOR => 1,
2490							);
2491	0		0				$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
2492
2493	0						my $residuals = $y - $y_pred;
2494
2495	0						my $win = $opt{WIN};
2496
2497	0	0					if (!$win) {
2498	0						$win = pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3);
2499	0						$win->env( $y_pred->minmax, $residuals->minmax,
2500							{XTITLE=>'predicted value', YTITLE=>'residuals',
2501							AXIS=>['BCNT', 'BCNST'], Border=>1,} );
2502							}
2503
2504	0						$win->points($y_pred, $residuals, { COLOR=>$opt{COLOR} });
2505							# add 0-line
2506	0						$win->line(pdl($y_pred->minmax), pdl(0,0), { COLOR=>$opt{COLOR} } );
2507
2508							$win->close
2509	0	0					unless $opt{WIN};
2510
2511	0						return;
2512							}
2513
2514
2515							=head2 plot_scores
2516
2517							Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.)
2518
2519							=for options
2520
2521							Default options (case insensitive):
2522
2523							CORR => 1, # boolean. PCA was based on correlation or covariance
2524							COMP => [0,1], # indices to components to plot
2525							# see PDLA::Graphics::PGPLOT::Window for next options
2526							WIN => undef, # pgwin object. not closed here if passed
2527							# allows comparing multiple lines in same plot
2528							# set env before passing WIN
2529							DEV => '/xs', # open and close dev for plotting if no WIN
2530							# defaults to '/png' in Windows
2531							SIZE => 640, # plot size in pixels
2532							COLOR => [1,2], # color for original and rotated scores
2533
2534							=for usage
2535
2536							Usage:
2537
2538							my %p = $data->pca();
2539							$data->plot_scores( $p{eigenvector}, \%opt );
2540
2541							=cut
2542
2543							*plot_scores = \&PDLA::plot_scores;
2544							sub PDLA::plot_scores {
2545	0	0		0	0		if (!$PGPLOT) {
2546	0						carp "No PDLA::Graphics::PGPLOT, no plot :(";
2547	0						return;
2548							}
2549	0	0					my $opt = pop @_
2550							if ref $_[-1] eq 'HASH';
2551	0						my ($self, $eigvec) = @_;
2552	0						my %opt = (
2553							CORR => 1, # boolean. PCA was based on correlation or covariance
2554							COMP => [0,1], # indices to components to plot
2555							# see PDLA::Graphics::PGPLOT::Window for next options
2556							WIN => undef, # pgwin object. not closed here if passed
2557							# allows comparing multiple lines in same plot
2558							# set env before passing WIN
2559							DEV => $DEV , # open and close dev for plotting if no WIN
2560							SIZE => 640, # plot size in pixels
2561							COLOR => [1,2], # color for original and transformed scoress
2562							);
2563	0		0				$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
2564
2565	0						my $i = pdl $opt{COMP};
2566	0	0					my $z = $opt{CORR}? $self->stddz : $self->dev_m;
2567
2568							# transformed normed values
2569	0						my $scores = sumover($eigvec( ,$i) * $z->transpose->dummy(1))->transpose;
2570	0						$z = $z( ,$i)->sever;
2571
2572	0						my $win = $opt{WIN};
2573	0						my $max = pdl($z, $scores)->abs->ceil->max;
2574	0	0					if (!$win) {
2575	0						$win = pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3);
2576	0						$win->env(-$max, $max, -$max, $max,
2577							{XTitle=>"Compoment $opt{COMP}->[0]", YTitle=>"Component $opt{COMP}->[1]",
2578							AXIS=>['ABCNST', 'ABCNST'], Border=>1, });
2579							}
2580
2581	0						$win->points( $z( ,0;-), $z( ,1;-), { COLOR=>$opt{COLOR}->[0] } );
2582	0						$win->points( $scores( ,0;-), $scores( ,1;-), { COLOR=>$opt{COLOR}->[1] } );
2583	0						$win->legend( ['original', 'transformed'], .2$max, .8$max, {color=>[1,2],symbol=>[1,1]} );
2584							$win->close
2585	0	0					unless $opt{WIN};
2586	0						return;
2587							}
2588
2589
2590							=head2 plot_screes
2591
2592							Scree plot. Plots proportion of variance accounted for by PCA components.
2593
2594							=for options
2595
2596							Default options (case insensitive):
2597
2598							NCOMP => 20, # max number of components to plot
2599							CUT => 0, # set to plot cutoff line after this many components
2600							# undef to plot suggested cutoff line for NCOMP comps
2601							# see PDLA::Graphics::PGPLOT::Window for next options
2602							WIN => undef, # pgwin object. not closed here if passed
2603							# allows comparing multiple lines in same plot
2604							# set env before passing WIN
2605							DEV => '/xs', # open and close dev for plotting if no WIN
2606							# defaults to '/png' in Windows
2607							SIZE => 640, # plot size in pixels
2608							COLOR => 1,
2609
2610							=for usage
2611
2612							Usage:
2613
2614							# variance should be in descending order
2615
2616							$pca{var}->plot_screes( {ncomp=>16} );
2617
2618							Or, because NCOMP is used so often, it is allowed a shortcut,
2619
2620							$pca{var}->plot_screes( 16 );
2621
2622							=cut
2623
2624							*plot_scree = \&PDLA::plot_screes; # here for now for compatibility
2625							*plot_screes = \&PDLA::plot_screes;
2626							sub PDLA::plot_screes {
2627	0	0		0	0		if (!$PGPLOT) {
2628	0						carp "No PDLA::Graphics::PGPLOT, no plot :(";
2629	0						return;
2630							}
2631	0	0					my $opt = pop @_
2632							if ref $_[-1] eq 'HASH';
2633	0						my ($self, $ncomp) = @_;
2634	0						my %opt = (
2635							NCOMP => 20, # max number of components to plot
2636							CUT => 0, # set to plot cutoff line after this many components
2637							# undef to plot suggested cutoff line for NCOMP comps
2638							# see PDLA::Graphics::PGPLOT::Window for next options
2639							WIN => undef, # pgwin object. not closed here if passed
2640							# allows comparing multiple lines in same plot
2641							# set env before passing WIN
2642							DEV => $DEV , # open and close dev for plotting if no WIN
2643							SIZE => 640, # plot size in pixels
2644							COLOR => 1,
2645							);
2646	0		0				$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt);
2647	0	0					$opt{NCOMP} = $ncomp
2648							if $ncomp;
2649							# re-use $ncomp below
2650	0	0					$ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
2651							$opt{CUT} = PDLA::Stats::Kmeans::_scree_ind $self(0:$ncomp-1)
2652	0	0					if !defined $opt{CUT};
2653
2654	0						my $win = $opt{WIN};
2655
2656	0	0					if (!$win) {
2657	0						$win = pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3);
2658	0						$win->env(0, $ncomp-1, 0, 1,
2659							{XTitle=>'Compoment', YTitle=>'Proportion of Variance Accounted for',
2660							AXIS=>['BCNT', 'BCNST'], Border=>1, });
2661							}
2662
2663							$win->points(sequence($ncomp), $self(0:$ncomp-1, ),
2664	0						{CHARSIZE=>2, COLOR=>$opt{COLOR}, PLOTLINE=>1} );
2665							$win->line( pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max+.05),
2666							{COLOR=>15} )
2667	0	0					if $opt{CUT};
2668							$win->close
2669	0	0					unless $opt{WIN};
2670	0						return;
2671							}
2672
2673							=head1 SEE ALSO
2674
2675							PDLA::Fit::Linfit
2676
2677							PDLA::Fit::LM
2678
2679							=head1 REFERENCES
2680
2681							Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied Multiple Regression/correlation Analysis for the Behavioral Sciences (3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers.
2682
2683							Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd ed.). New York, NY: Wiley-Interscience.
2684
2685							Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated measures data in cognitive research. Journal of Experimental Psychology: Learning, Memory, & Cognition, 16, 149-157.
2686
2687							Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of Meaning. Champaign, IL: University of Illinois Press.
2688
2689							Rutherford, A. (2001). Introducing Anova and Ancova: A GLM Approach (1st ed.). Thousand Oaks, CA: Sage Publications.
2690
2691							Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved April 10, 2011 from http://citeseerx.ist.psu.edu/
2692
2693							The GLM procedure: unbalanced ANOVA for two-way design with interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18, 2009 from http://support.sas.com/
2694
2695							Van den Noortgatea, W., & Onghenaa, P. (2006). Analysing repeated measures data in cognitive research: A comment on regression coefficient analyses. European Journal of Cognitive Psychology, 18, 937-952.
2696
2697							=head1 AUTHOR
2698
2699							Copyright (C) 2009 Maggie J. Xiong
2700
2701							All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDLA distribution.
2702
2703							=cut
2704
2705
2706
2707							;
2708
2709
2710
2711							# Exit with OK status
2712
2713							1;
2714
2715