File Coverage

lib/PDL/Stats/GLM.pd

Criterion	Covered	Total	%
statement	425	588	72.2
branch	161	272	59.1
condition	36	69	52.1
subroutine	31	41	75.6
pod	0	19	0.0
total	653	989	66.0

line	stmt	bran	cond	sub	pod	time	code
1							use strict;
2							use warnings;
3
4							my $F = [qw(F D)];
5
6							pp_add_exported(qw(ols_t ols ols_rptd
7							anova anova_rptd anova_design_matrix
8							dummy_code effect_code effect_code_w interaction_code
9							r2_change logistic
10							pca pca_sorti
11							plot_means plot_residuals plot_screes
12							));
13
14							pp_addpm({At=>'Top'}, <<'EOD');
15	1			1		9
	1					2
	1					43
16	1			1		5	use strict;
	1					1
	1					66
17							use warnings;
18	1			1		5
	1					1
	1					108
19	1			1		7	use Carp;
	1					2
	1					7
20	1			1		2817	use PDL::LiteF;
	1					2
	1					11
21	1			1		247	use PDL::MatrixOps;
	1					2
	1					6
22	1			1		1097	use PDL::Stats::Basic;
	1					4
	1					12
23							use PDL::Stats::Kmeans;
24
25							eval { require PDL::Core; require PDL::GSL::CDF; };
26							my $CDF = 1 if !$@;
27
28							=encoding utf8
29
30							=head1 NAME
31
32							PDL::Stats::GLM -- general and generalized linear modelling methods such as ANOVA, linear regression, PCA, and logistic regression.
33
34							=head1 SYNOPSIS
35
36							use PDL::LiteF;
37							use PDL::Stats::GLM;
38
39							# do a multiple linear regression and plot the residuals
40							my $y = pdl( 8, 7, 7, 0, 2, 5, 0 );
41							my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component
42							[ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component
43							my %m = $y->ols( $x, {plot=>1} );
44							print "$_\t$m{$_}\n" for sort keys %m;
45
46							=head1 DESCRIPTION
47
48							For more about general linear modelling, see
49							L.
50							For an unbelievably thorough text on experimental design
51							and analysis, including linear modelling, see L
52							Course in Design and Analysis of Experiments, Gary
53							W. Oehlert\|http://users.stat.umn.edu/~gary/book/fcdae.pdf>.
54
55							The terms FUNCTIONS and METHODS are arbitrarily used to refer to
56							methods that are broadcastable and methods that are NOT broadcastable,
57							respectively. FUNCTIONS support bad values.
58
59							P-values, where appropriate, are provided if PDL::GSL::CDF is installed.
60
61							=cut
62
63							EOD
64
65							pp_addhdr('
66							#include
67							#include
68							#include
69
70							'
71							);
72
73							pp_def('fill_m',
74							Pars => 'a(n); [o]b(n)',
75							Inplace => 1,
76							GenericTypes => $F,
77							HandleBad => 1,
78							Code => '
79							broadcastloop %{
80							PDL_IF_BAD($GENERIC(b) sa = 0;
81							PDL_Indx N = 0;
82							loop (n) %{
83							if ($ISBAD($a())) continue;
84							sa += $a();
85							N++;
86							%}
87							$GENERIC(b) m = N ? sa / N : 0;,)
88							loop (n) %{
89							$b() = PDL_IF_BAD($ISBAD($a()) ? m :,) $a();
90							%}
91							%}
92							$PDLSTATESETGOOD(b);
93							',
94							Doc => '
95							=for ref
96
97							Replaces bad values with sample mean. Mean is set to 0 if all obs are
98							bad.
99
100							=for usage
101
102							pdl> p $data
103							[
104							[ 5 BAD 2 BAD]
105							[ 7 3 7 BAD]
106							]
107
108							pdl> p $data->fill_m
109							[
110							[ 5 3.5 2 3.5]
111							[ 7 3 7 5.66667]
112							]
113							',
114							BadDoc => 'The output pdl badflag is cleared.',
115							);
116
117							pp_def('fill_rand',
118							Pars => 'a(n); [o]b(n)',
119							Inplace => 1,
120							HandleBad => 1,
121							Code => '
122							broadcastloop %{
123							PDL_IF_BAD($GENERIC(a) *g[ $SIZE(n) ];
124							PDL_Indx i = 0;
125							srand( time( NULL ) );
126							loop (n) %{
127							if ( $ISGOOD($a()) ) {
128							g[i++] = &$a();
129							}
130							%},)
131							loop (n) %{
132							PDL_IF_BAD(if ($ISBAD($a())) {
133							/* XXX-FIXME works on 64bit, but rand() is quite limited */
134							PDL_Indx j = (PDL_Indx) ((i-1) * (double)(rand()) / (double)(RAND_MAX) + .5);
135							$b() = *g[j];
136							continue;
137							},)
138							$b() = $a();
139							%}
140							%}
141							$PDLSTATESETGOOD(b);
142							',
143							Doc => '
144							=for ref
145
146							Replaces bad values with random sample (with replacement) of good
147							observations from the same variable.
148
149							=for usage
150
151							pdl> p $data
152							[
153							[ 5 BAD 2 BAD]
154							[ 7 3 7 BAD]
155							]
156
157							pdl> p $data->fill_rand
158
159							[
160							[5 2 2 5]
161							[7 3 7 7]
162							]
163							',
164							BadDoc => 'The output pdl badflag is cleared.',
165							);
166
167							pp_def('dev_m',
168							Pars => 'a(n); [o]b(n)',
169							Inplace => 1,
170							GenericTypes => $F,
171							HandleBad => 1,
172							Code => '
173							$GENERIC(b) sa = 0, m = 0;
174							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
175							loop (n) %{
176							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
177							sa += $a();
178							PDL_IF_BAD(N++;,)
179							%}
180							m = sa / N;
181							loop (n) %{
182							if ($ISBAD($a())) { $SETBAD(b()); continue; }
183							$b() = $a() - m;
184							%}
185							',
186							Doc => 'Replaces values with deviations from the mean.',
187							);
188
189							pp_def('stddz',
190							Pars => 'a(n); [o]b(n)',
191							Inplace => 1,
192							GenericTypes => $F,
193							HandleBad => 1,
194							Code => '
195							$GENERIC(b) sa = 0, a2 = 0, m = 0, sd = 0;
196							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
197							loop (n) %{
198							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
199							sa += $a();
200							a2 += $a() * $a();
201							PDL_IF_BAD(N++;,)
202							%}
203							PDL_IF_BAD(if (N < 1) { loop (n) %{ $SETBAD(b()); %} continue; },)
204							m = sa / N;
205							sd = pow( a2/N - m*m, .5 );
206							loop (n) %{
207							PDL_IF_BAD(if ($ISBAD(a())) { $SETBAD(b()); continue; },)
208							/* sd? does not work, presumably due to floating point */
209							$b() = (sd>0)? (($a() - m) / sd) : 0;
210							%}
211							',
212							Doc => 'Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0).',
213							);
214
215							pp_def('sse',
216							Pars => 'a(n); b(n); [o]c()',
217							GenericTypes => $F,
218							HandleBad => 1,
219							Code => '
220							$GENERIC(c) ss = 0;
221							loop (n) %{
222							PDL_IF_BAD(if ( $ISBAD($a()) \|\| $ISBAD($b()) ) continue;,)
223							$GENERIC() diff = $a() - $b();
224							ss += diff * diff;
225							%}
226							$c() = ss;
227							',
228							Doc => 'Sum of squared errors between actual and predicted values.',
229							);
230
231							pp_def('mse',
232							Pars => 'a(n); b(n); [o]c()',
233							GenericTypes => $F,
234							HandleBad => 1,
235							Code => '
236							$GENERIC(c) ss = 0;
237							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
238							loop (n) %{
239							PDL_IF_BAD(if ( $ISBAD($a()) \|\| $ISBAD($b()) ) continue;,)
240							$GENERIC() diff = $a() - $b();
241							ss += diff * diff;
242							PDL_IF_BAD(N++;,)
243							%}
244							if (N < 1) { $SETBAD(c()); continue; }
245							$c() = ss/N;
246							',
247							Doc => 'Mean of squared errors between actual and predicted values, ie variance around predicted value.',
248							);
249
250							pp_def('rmse',
251							Pars => 'a(n); b(n); [o]c()',
252							GenericTypes => $F,
253							HandleBad => 1,
254							Code => '
255							$GENERIC(c) d2 = 0;
256							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
257							loop (n) %{
258							PDL_IF_BAD(if ( $ISBAD($a()) \|\| $ISBAD($b()) ) continue;,)
259							$GENERIC() diff = $a() - $b();
260							d2 += diff * diff;
261							PDL_IF_BAD(N++;,)
262							%}
263							if (N < 1) { $SETBAD(c()); continue; }
264							$c() = sqrt( d2 / N );
265							',
266							Doc => 'Root mean squared error, ie stdv around predicted value.',
267							);
268
269							pp_def('pred_logistic',
270							Pars => 'a(n,m); b(m); [o]c(n)',
271							GenericTypes => $F,
272							HandleBad => 1,
273							Code => '
274							loop (n) %{
275							$GENERIC(c) l = 0;
276							PDL_IF_BAD(PDL_Indx bad = 0;,)
277							loop (m) %{
278							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) { bad = 1; continue; },)
279							l += $a() * $b();
280							%}
281							PDL_IF_BAD(if (bad) { $SETBAD($c()); continue; },)
282							$c() = 1 / ( 1 + exp(-l) );
283							%}
284							',
285							Doc => '
286							=for ref
287
288							Calculates predicted prob value for logistic regression.
289
290							=for usage
291
292							# glue constant then apply coeff returned by the logistic method
293							$pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} );
294							',
295							);
296
297							pp_def('d0',
298							Pars => 'a(n); [o]c()',
299							GenericTypes => $F,
300							HandleBad => 1,
301							Code => '
302							$GENERIC(c) p = 0, ll = 0;
303							PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
304							loop (n) %{
305							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
306							p += $a();
307							PDL_IF_BAD(N++;,)
308							%}
309							if (N < 1) { $SETBAD(c()); continue; }
310							p /= N;
311							loop (n) %{
312							PDL_IF_BAD(if ($ISBAD($a())) continue;,)
313							ll += $a()? log( p ) : log( 1 - p );
314							%}
315							$c() = -2 * ll;
316							',
317							Doc => 'Null deviance for logistic regression.',
318							);
319
320							pp_def('dm',
321							Pars => 'a(n); b(n); [o]c()',
322							GenericTypes => $F,
323							HandleBad => 1,
324							Code => '
325							$GENERIC(c) ll;
326							ll = 0;
327							loop (n) %{
328							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) continue;,)
329							ll += $a()? log( $b() ) : log( 1 - $b() );
330							%}
331							$c() = -2 * ll;
332							',
333							Doc => '
334							=for ref
335
336							Model deviance for logistic regression.
337
338							=for usage
339
340							my $dm = $y->dm( $y_pred );
341							# null deviance
342							my $d0 = $y->dm( ones($y->nelem) * $y->avg );
343							',
344							);
345
346							pp_def('dvrs',
347							Pars => 'a(); b(); [o]c()',
348							GenericTypes => $F,
349							HandleBad => 1,
350							Code => '
351							PDL_IF_BAD(if ($ISBAD($a()) \|\| $ISBAD($b())) { $SETBAD( $c() ); continue; },)
352							$c() = $a()? sqrt( -2 * log($b()) )
353							: -1 * sqrt( -2 * log(1-$b()) )
354							;
355							',
356							Doc => 'Deviance residual for logistic regression.',
357							);
358
359							pp_addpm pp_line_numbers(__LINE__, <<'EOD');
360
361							=head2 ols_t
362
363							=for ref
364
365							Broadcasted version of ordinary least squares regression (B). The
366							price of broadcasting was losing significance tests for coefficients
367							(but see B). The fitting function was shamelessly copied then
368							modified from PDL::Fit::Linfit. Intercept is FIRST of coeff if CONST => 1.
369
370							ols_t does not handle bad values. consider B or B
371							if there are bad values.
372
373							=for options
374
375							Default options (case insensitive):
376
377							CONST => 1,
378
379							=for usage
380
381							Usage:
382
383							# DV, 2 person's ratings for top-10 box office movies
384							# ascending sorted by box office numbers
385
386							pdl> p $y = pdl '1 1 2 4 4 4 4 5 5 5; 1 2 2 2 3 3 3 3 5 5'
387
388							# model with 2 IVs, a linear and a quadratic trend component
389
390							pdl> $x = cat sequence(10), sequence(10)**2
391
392							# suppose our novice modeler thinks this creates 3 different models
393							# for predicting movie ratings
394
395							pdl> p $x = cat $x, $x * 2, $x * 3
396							[
397							[
398							[ 0 1 2 3 4 5 6 7 8 9]
399							[ 0 1 4 9 16 25 36 49 64 81]
400							]
401							[
402							[ 0 2 4 6 8 10 12 14 16 18]
403							[ 0 2 8 18 32 50 72 98 128 162]
404							]
405							[
406							[ 0 3 6 9 12 15 18 21 24 27]
407							[ 0 3 12 27 48 75 108 147 192 243]
408							]
409							]
410
411							pdl> p $x->info
412							PDL: Double D [10,2,3]
413
414							# insert a dummy dim between IV and the dim (model) to be broadcasted
415
416							pdl> %m = $y->ols_t( $x->dummy(2) )
417
418							pdl> p "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
419
420							# 2 persons' ratings, each fitted with 3 "different" models
421							F [
422							[ 38.314159 25.087209]
423							[ 38.314159 25.087209]
424							[ 38.314159 25.087209]
425							]
426
427							# df is the same across dv and iv models
428							F_df [2 7]
429							F_p [
430							[0.00016967051 0.00064215074]
431							[0.00016967051 0.00064215074]
432							[0.00016967051 0.00064215074]
433							]
434
435							R2 [
436							[ 0.9162963 0.87756762]
437							[ 0.9162963 0.87756762]
438							[ 0.9162963 0.87756762]
439							]
440
441							b [ # constant linear quadratic
442							[
443							[ 0.66363636 0.99015152 -0.056818182] # person 1
444							[ 1.4 0.18939394 0.022727273] # person 2
445							]
446							[
447							[ 0.66363636 0.49507576 -0.028409091]
448							[ 1.4 0.09469697 0.011363636]
449							]
450							[
451							[ 0.66363636 0.33005051 -0.018939394]
452							[ 1.4 0.063131313 0.0075757576]
453							]
454							]
455
456							# our novice modeler realizes at this point that
457							# the 3 models only differ in the scaling of the IV coefficients
458							ss_model [
459							[ 20.616667 13.075758]
460							[ 20.616667 13.075758]
461							[ 20.616667 13.075758]
462							]
463
464							ss_residual [
465							[ 1.8833333 1.8242424]
466							[ 1.8833333 1.8242424]
467							[ 1.8833333 1.8242424]
468							]
469
470							ss_total [22.5 14.9]
471							y_pred [
472							[
473							[0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273]
474							...
475
476							=cut
477
478							*ols_t = \&PDL::ols_t;
479							sub PDL::ols_t {
480	117			117	0	247912	_ols_common(1, @_);
481							}
482
483							sub _ols_common {
484	121			121		409	my ($broadcasted, $y, $ivs, $opt) = @_;
485	121					595	($y, $ivs) = _ols_prep_inputs(@_);
486	121					442	_ols_main($broadcasted, $y, $ivs, $opt);
487							}
488
489							sub _ols_prep_inputs {
490							# y [n], ivs [n x attr] pdl
491	121			121		366	my ($broadcasted, $y, $ivs, $opt) = @_;
492	121					729	my %opt = (
493							CONST => 1,
494							PLOT => 0,
495							WIN => undef, # for plotting
496							);
497	121	100				476	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
	109					770
498	121	100				567	if (!$broadcasted) {
499	4					14	$y = $y->squeeze;
500	4	50				215	$y->getndims > 1 and
501							croak "use ols_t for broadcasted version";
502							}
503	121	100				834	$ivs = $ivs->dummy(1) if $ivs->getndims == 1;
504	121	100				2064	($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$broadcasted;
505							# set up ivs and const as ivs
506							$opt{CONST} and
507	121	100				1073	$ivs = ones($ivs->dim(0))->glue( 1, $ivs );
508	121					3865	($y, $ivs);
509							}
510
511							sub _ols_main {
512	141			141		1041	my ($broadcasted, $y, $ivs, $opt) = @_;
513	141					632	my %opt = (
514							CONST => 1,
515							PLOT => 0,
516							WIN => undef, # for plotting
517							);
518	141	100				457	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
	129					771
519	141					773	my $C = inv( $ivs x $ivs->t );
520							# Internally normalise data
521							# (double) it or ushort y and sequence iv won't work right
522	141					9867212	my $ymean = $y->abs->avgover->double;
523	141					15721	$ymean->where( $ymean==0 ) .= 1;
524	141	100				31977	my $divisor = $broadcasted ? $ymean->dummy(0) : $ymean;
525	141					6403	my $y2 = $y / $divisor;
526	141					4122	my $Y = $ivs x $y2->dummy(0);
527							# Do the fit
528							# Fitted coefficients vector
529	141					16410	my $coeff = PDL::squeeze( $C x $Y );
530	141	100	100			16779	$coeff = $coeff->dummy(0)
			100
531							if $broadcasted and $coeff->getndims == 1 and $y->getndims > 1;
532	141					766	$coeff *= $divisor; # Un-normalise
533							# *$coeff x $ivs looks nice but produces nan on successive tries*
534	141	100				2876	my $y_pred = sumover( ($broadcasted ? $coeff->dummy(1) : $coeff) * $ivs->transpose );
535	141	50				13833	$opt{PLOT} and $y->plot_residuals( $y_pred, \%opt );
536	141	100				2284	return $coeff unless wantarray;
537	6					22	my %ret = (y_pred => $y_pred);
538	6	100				137	$ret{ss_total} = $opt{CONST} ? $y->ss : sumover( $y ** 2 );
539	6					112	$ret{ss_residual} = $y->sse( $ret{y_pred} );
540	6					21	$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
541	6					91	$ret{R2} = $ret{ss_model} / $ret{ss_total};
542	6	100				95	my $n_var = $opt{CONST} ? $ivs->dim(1) - 1 : $ivs->dim(1);
543	6					33	$ret{F_df} = pdl( $n_var, my $df1 = $y->dim(0) - $ivs->dim(1) );
544							$ret{F} = $ret{ss_model} / $n_var
545	6					370	/ ($ret{ss_residual} / $df1);
546	6	50				282	$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $n_var, $df1 )
547							if $CDF;
548	6	100				16	if (!$broadcasted) {
549	2	50				16	my $se_b = ones( $coeff->dims? $coeff->dims : 1 );
550							$opt{CONST} and
551	2	50				85	$se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 * $C->slice(0,0) );
552							# get the se for bs by successively regressing each iv by the rest ivs
553	2	50				314	if ($ivs->dim(1) > 1) {
554	2	50				11	my @coords = $opt{CONST} ? 1..$n_var : 0..$n_var-1;
555	2	50				13	my $ones = !$opt{CONST} ? undef : ones($ivs->dim(0));
556	2					73	for my $k (@coords) {
557	2					11	my $G = $ivs->dice_axis(1, [grep $_ != $k, @coords]);
558	2	50				163	$G = $ones->glue( 1, $G ) if $opt{CONST};
559	2					203	my $b_G = $ivs->slice(':',$k)->ols( $G, {CONST=>0,PLOT=>0} );
560	2					14	my $ss_res_k = $ivs->slice(':',$k)->squeeze->sse( sumover($b_G * $G->transpose) );
561	2					225	$se_b->slice($k) .= sqrt( $ret{ss_residual} / $df1 / $ss_res_k );
562							}
563							}
564							else {
565							$se_b->slice(0)
566	0					0	.= sqrt( $ret{ss_residual} / $df1 / sum( $ivs->slice(':',0)**2 ) );
567							}
568	2					155	$ret{b_se} = $se_b;
569	2					6	$ret{b_t} = $coeff / $ret{b_se};
570	2	50				29	$ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $df1 ) )
571							if $CDF;
572							}
573	6	50				26	for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
	46					1488
574	6					196	$ret{b} = $coeff;
575	6					124	return %ret;
576							}
577
578							=head2 r2_change
579
580							=for ref
581
582							Significance test for the incremental change in R2 when new variable(s)
583							are added to an ols regression model.
584
585							Returns the change stats as well as stats for both models. Based on
586							L. (One way to make up for the lack of significance tests for
587							coeffs in ols_t).
588
589							=for options
590
591							Default options (case insensitive):
592
593							CONST => 1,
594
595							=for usage
596
597							Usage:
598
599							# suppose these are two persons' ratings for top 10 box office movies
600							# ascending sorted by box office
601
602							pdl> p $y = qsort ceil(random(10, 2) * 5)
603							[
604							[1 1 2 2 2 3 4 4 4 4]
605							[1 2 2 3 3 3 4 4 5 5]
606							]
607
608							# first IV is a simple linear trend
609
610							pdl> p $x1 = sequence 10
611							[0 1 2 3 4 5 6 7 8 9]
612
613							# the modeler wonders if adding a quadratic trend improves the fit
614
615							pdl> p $x2 = sequence(10) ** 2
616							[0 1 4 9 16 25 36 49 64 81]
617
618							# two difference models are given in two pdls
619							# each as would be pass on to ols_t
620							# the 1st model includes only linear trend
621							# the 2nd model includes linear and quadratic trends
622							# when necessary use dummy dim so both models have the same ndims
623
624							pdl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )
625
626							pdl> p "$_\t$c{$_}\n" for sort keys %c
627							# person 1 person 2
628							F_change [0.72164948 0.071283096]
629							# df same for both persons
630							F_df [1 7]
631							F_p [0.42370145 0.79717232]
632							R2_change [0.0085966043 0.00048562549]
633							model0 HASH(0x8c10828)
634							model1 HASH(0x8c135c8)
635
636							# the answer here is no.
637
638							=cut
639
640							*r2_change = \&PDL::r2_change;
641							sub PDL::r2_change {
642	1			1	0	9154	my ($self, $ivs0, $ivs1, $opt) = @_;
643	1	50				12	$ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
644
645	1					48	my %ret;
646
647	1					6	$ret{model0} = { $self->ols_t( $ivs0, $opt ) };
648	1					5	$ret{model1} = { $self->ols_t( $ivs1, $opt ) };
649
650	1					5	$ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
651							$ret{F_df}
652							= pdl(my $df0 = $ivs1->dim(1) - $ivs0->dim(1),
653	1					25	my $df1 = $ret{model1}->{F_df}->slice('(1)') );
654							$ret{F_change}
655							= $ret{R2_change} * $df1
656	1					110	/ ( (1-$ret{model1}->{R2}) * $df0 );
657	1	50				113	$ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $df0, $df1 )
658							if $CDF;
659
660	1	100				5	for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
	5					90
661
662	1					43	%ret;
663							}
664
665							=head1 METHODS
666
667							=head2 anova
668
669							=for ref
670
671							Analysis of variance. Uses type III sum of squares for unbalanced data.
672
673							Dependent variable should be a 1D pdl. Independent variables can be
674							passed as 1D perl array ref or 1D pdl.
675
676							Will only calculate p-value (C) if there are more samples than the
677							product of categories of all the IVs.
678
679							Supports bad value (by ignoring missing or BAD values in dependent and
680							independent variables list-wise).
681
682							For more on ANOVA, see L.
683
684							=for options
685
686							Default options (case insensitive):
687
688							V => 1, # carps if bad value in variables
689							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
690							PLOT => 0, # plots highest order effect
691							# can set plot_means options here
692							WIN => undef, # for plotting
693
694							=for usage
695
696							Usage:
697
698							# suppose this is ratings for 12 apples
699
700							pdl> p $y = qsort ceil( random(12)*5 )
701							[1 1 2 2 2 3 3 4 4 4 5 5]
702
703							# IV for types of apple
704
705							pdl> p $a = sequence(12) % 3 + 1
706							[1 2 3 1 2 3 1 2 3 1 2 3]
707
708							# IV for whether we baked the apple
709
710							pdl> @b = qw( y y y y y y n n n n n n )
711
712							pdl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } )
713
714							pdl> p "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
715							F 2.46666666666667
716							F_df [5 6]
717							F_p 0.151168719948632
718							ms_model 3.08333333333333
719							ms_residual 1.25
720							ss_model 15.4166666666667
721							ss_residual 7.5
722							ss_total 22.9166666666667
723							\| apple \| F 0.466666666666667
724							\| apple \| F_p 0.648078345471096
725							\| apple \| df 2
726							\| apple \| m [2.75 3 3.5]
727							\| apple \| ms 0.583333333333334
728							\| apple \| se [0.85391256 0.81649658 0.64549722]
729							\| apple \| ss 1.16666666666667
730							\| apple \|\| err df 6
731							\| apple ~ bake \| F 0.0666666666666671
732							\| apple ~ bake \| F_p 0.936190104380701
733							\| apple ~ bake \| df 2
734							\| apple ~ bake \| m [
735							[1.5 2 2.5]
736							[ 4 4 4.5]
737							]
738							\| apple ~ bake \| ms 0.0833333333333339
739							\| apple ~ bake \| se [
740							[0.5 1 0.5]
741							[ 1 1 0.5]
742							]
743							\| apple ~ bake \| ss 0.166666666666668
744							\| apple ~ bake \|\| err df 6
745							\| bake \| F 11.2666666666667
746							\| bake \| F_p 0.015294126084452
747							\| bake \| df 1
748							\| bake \| m [2 4.1666667]
749							\| bake \| ms 14.0833333333333
750							\| bake \| se [0.36514837 0.40138649]
751							\| bake \| ss 14.0833333333333
752							\| bake \|\| err df 6
753
754							This is implemented as a call to L, with an C
755							subjects vector.
756
757							=cut
758
759							*anova = \&PDL::anova;
760							sub PDL::anova {
761	6			6	0	85104	my ($y, @args) = @_;
762	6					30	anova_rptd($y, undef, @args);
763							}
764
765							sub _interactions {
766	25			25		72	my ($ivs_ref, $idv) = @_;
767	25					53	my (@inter, @idv_inter);
768	25					84	for my $nway ( 2 .. @$ivs_ref ) {
769	28					142	my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
770	28					85	while ( my @v = &$iter_idv() ) {
771	48					153	push @inter, interaction_code(@$ivs_ref[@v]);
772	48					285	push @idv_inter, join ' ~ ', @$idv[@v];
773							}
774							}
775	25					93	(\@inter, \@idv_inter);
776							}
777
778							# now prepare for cell mean
779							sub _interactions_cm {
780	19			19		47	my ($ivs_ref, $pdl_ivs_raw) = @_;
781	19					93	my ($dim0, @inter_cm, @inter_cmo) = $ivs_ref->[0]->dim(0);
782	19					61	for my $nway ( 2 .. @$ivs_ref ) {
783	21					526	my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
784	21					77	while ( my @v = &$iter_idv() ) {
785	37					59	my @i_cm;
786	37					109	for my $o ( 0 .. $dim0 - 1 ) {
787	1714					199463	push @i_cm, join '', map $_->slice("($o)"), @$pdl_ivs_raw[@v];
788							}
789	37					4277	my ($inter, $map) = effect_code( \@i_cm );
790	37					114	push @inter_cm, $inter;
791							# get the order to put means in correct multi dim pdl pos
792							# this is order in var_e dim(1)
793	37					324	my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map;
	719					1229
794							# this is order needed for cell mean
795	37					179	my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) }
	772					1253
796							0 .. $#levels;
797	37					197	push @inter_cmo, pdl @i_cmo;
798							}
799							}
800	19					122	(\@inter_cmo, \@inter_cm);
801							}
802
803							sub _cell_means {
804	19			19		62	my ($data, $ivs_cm_ref, $i_cmo_ref, $idv, $pdl_ivs_raw) = @_;
805	19					36	my %ind_id;
806	19					163	@ind_id{ @$idv } = 0..$#$idv;
807	19					39	my %cm;
808	19					39	my $i = 0;
809	19					56	for (@$ivs_cm_ref) {
810	77	50				246	confess "_cell_means passed empty ivs_cm_ref ndarray at pos $i" if $_->isempty;
811	77					765	my $last = zeroes $_->dim(0);
812	77					1306	my $i_neg = which $_->slice(':',0) == -1;
813	77					12299	$last->slice($i_neg) .= 1;
814	77					6713	$_->where($_ == -1) .= 0;
815	77					15473	$_ = $_->glue(1, $last);
816	77					16092	my @v = split ' ~ ', $idv->[$i];
817	77					436	my @shape = map $pdl_ivs_raw->[$_]->uniq->nelem, @ind_id{@v};
818	77					62322	my ($m, $ss) = $data->centroid( $_ );
819	77					318	$m = $m->slice($i_cmo_ref->[$i])->sever;
820	77					4918	$ss = $ss->slice($i_cmo_ref->[$i])->sever;
821	77					3713	$m = $m->reshape(@shape);
822	77					3839	my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape);
823	77					7466	$cm{ "\| $idv->[$i] \| m" } = $m;
824	77					248	$cm{ "\| $idv->[$i] \| se" } = $se;
825	77					484	$i++;
826							}
827	19					114	\%cm;
828							}
829
830							# http://www.perlmonks.org/?node_id=371228
831							sub _combinations {
832	49			49		137	my ($num, $arr) = @_;
833	0			0		0	return sub { return }
834	49	50	33			304	if $num == 0 or $num > @$arr;
835	49					99	my @pick;
836							return sub {
837	134	100		134		3388	return @$arr[ @pick = ( 0 .. $num - 1 ) ]
838							unless @pick;
839	85					191	my $i = $#pick;
840	85		100			870	$i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i;
841	85	100				698	return if $i < 0;
842	36					129	@pick[$i .. $#pick] = $pick[$i] .. $#$arr;
843	36					145	return @$arr[@pick];
844	49					412	};
845							}
846
847							=head2 anova_rptd
848
849							=for ref
850
851							Repeated measures and mixed model anova. Uses type III sum of squares.
852
853							The standard error (se) for the means are based on the relevant mean
854							squared error from the anova, ie it is pooled across levels of the effect.
855							Will only calculate p-value (C) if there are more samples than the
856							product of categories of all the IVs.
857
858							Uses L, so supports bad values.
859
860							For more on repeated measures ANOVA, see
861							L,
862							and for mixed models, see
863							L.
864
865							=for options
866
867							Default options (case insensitive):
868
869							V => 1, # carps if bad value in dv
870							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
871							BTWN => [], # indices of between-subject IVs (matches IVNM indices)
872							PLOT => 0, # plots highest order effect
873							# see plot_means() for more options
874							WIN => undef, # for plotting
875
876							=for usage
877
878							Usage:
879
880							Some fictional data: recall_w_beer_and_wings.txt
881
882							Subject Beer Wings Recall
883							Alex 1 1 8
884							Alex 1 2 9
885							Alex 1 3 12
886							Alex 2 1 7
887							Alex 2 2 9
888							Alex 2 3 12
889							Brian 1 1 12
890							Brian 1 2 13
891							Brian 1 3 14
892							Brian 2 1 9
893							Brian 2 2 8
894							Brian 2 3 14
895							...
896
897							# rtable allows text only in 1st row and col
898							my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt';
899
900							my ($b, $w, $dv) = $data->dog;
901							# subj and IVs can be 1d pdl or @ ref
902							# subj must be the first argument
903							my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} );
904
905							print "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
906
907							ss_residual 19.0833333333333
908							ss_subject 24.8333333333333
909							ss_total 133.833333333333
910							\| Beer \| F 9.39130434782609
911							\| Beer \| F_p 0.0547977008378944
912							\| Beer \| df 1
913							\| Beer \| m [10.916667 8.9166667]
914							\| Beer \| ms 24
915							\| Beer \| se [0.4614791 0.4614791]
916							\| Beer \| ss 24
917							\| Beer \|\| err df 3
918							\| Beer \|\| err ms 2.55555555555556
919							\| Beer \|\| err ss 7.66666666666667
920							\| Beer ~ Wings \| F 0.510917030567687
921							\| Beer ~ Wings \| F_p 0.623881438624431
922							\| Beer ~ Wings \| df 2
923							\| Beer ~ Wings \| m [
924							[ 10 7]
925							[ 10.5 9.25]
926							[12.25 10.5]
927							]
928							\| Beer ~ Wings \| ms 1.625
929							\| Beer ~ Wings \| se [
930							[0.89170561 0.89170561]
931							[0.89170561 0.89170561]
932							[0.89170561 0.89170561]
933							]
934							\| Beer ~ Wings \| ss 3.25000000000001
935							\| Beer ~ Wings \|\| err df 6
936							\| Beer ~ Wings \|\| err ms 3.18055555555555
937							\| Beer ~ Wings \|\| err ss 19.0833333333333
938							\| Wings \| F 4.52851711026616
939							\| Wings \| F_p 0.0632754786153548
940							\| Wings \| df 2
941							\| Wings \| m [8.5 9.875 11.375]
942							\| Wings \| ms 16.5416666666667
943							\| Wings \| se [0.67571978 0.67571978 0.67571978]
944							\| Wings \| ss 33.0833333333333
945							\| Wings \|\| err df 6
946							\| Wings \|\| err ms 3.65277777777778
947							\| Wings \|\| err ss 21.9166666666667
948
949							For mixed model anova, ie when there are between-subject IVs
950							involved, feed the IVs as above, but specify in BTWN which IVs are
951							between-subject. For example, if we had added age as a between-subject
952							IV in the above example, we would do
953
954							my %m = $dv->anova_rptd( $subj, $age, $b, $w,
955							{ ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] });
956
957							=cut
958
959							*anova_rptd = \&PDL::anova_rptd;
960							sub PDL::anova_rptd {
961	20	100		20	0	174624	my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
962	20					65	my ($y, $subj, @ivs_raw) = @_;
963	20	50				122	confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
964							croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?"
965	20	50	66			110	if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem);
	3					38
966	20					52	for (@ivs_raw) {
967	43	50	66			1473	croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1
968							}
969	20	100				937	my %opt = (
970							V => 1, # carps if bad value in dv
971							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
972							( !defined($subj) ? () : (
973							BTWN => [], # indices of between-subject IVs (matches IVNM indices)
974							)),
975							PLOT => 0, # plots highest order effect
976							WIN => undef, # for plotting
977							);
978	20	100				70	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
	19					199
979	3					15	$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
980	20	100	66			95	if !$opt{IVNM} or !@{ $opt{IVNM} };
	20					106
981	20					95	(my ($dsgn, $idv), $y, my ($sj, $ivs_ref_filtrd, $pdl_ivs_raw, $ivs_ref, $err_ref)) = $y->anova_design_matrix($subj, @ivs_raw, \%opt);
982	20	50				87	confess "anova_rptd: got singular matrix for X' x X"
983							if any +($dsgn->t x $dsgn)->det == 0;
984	20					782552	my $b_full = _ols_main(1, $y, $dsgn->t, {CONST=>0});
985
986	20					447	my %ret = (ss_total => $y->ss,
987							ss_residual => $y->sse( sumover( $b_full * $dsgn ) ));
988
989	20	100				892	if (defined $subj) {
990	14					72	my @full = (@$ivs_ref, @$err_ref);
991	14					32	my $has_btwn = @{ $opt{BTWN} };
	14					65
992	14					36	my @is_btwn; $is_btwn[$_] = 1 for @{ $opt{BTWN} };
	14					26
	14					68
993	14					55	my @within_inds = 0 .. $#ivs_raw;
994	14	100				73	@within_inds = grep !$is_btwn[$_], @within_inds if $has_btwn;
995	14					108	my $within_df = pdl(map $_->dim(1), @full[@within_inds])->prodover->sclr;
996	14					1078	EFFECT: for my $k (0 .. $#full) {
997	122	100				3315	my $e = ($k > $#$ivs_ref)? '\| err' : '';
998	122	100				355	my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k;
999	122	100				627	my $i_pref = $k == $#full ? undef : "\| $idv->[$i] \|";
1000
1001	122	100				458	if (!defined $full[$k]) { # ss_residual as error
		100
1002	25					97	$ret{ "$i_pref$e ss" } = $ret{ss_residual};
1003							# highest ord inter for purely within design, (p-1)(q-1)(n-1)
1004	25	100				137	my $factor = (ref $full[-1] ? $full[-1] : $err_ref->[$full[-1]])->dim(1);
1005	25					93	my $df = $ret{ "$i_pref$e df" } = $factor * $within_df;
1006	25	50				67	die "${i_pref}residual df = 0" if $df <= 0;
1007	25					84	$ret{ "$i_pref$e ms" } = $ret{ "$i_pref$e ss" } / $df;
1008							} elsif (ref $full[$k]) { # unique error term
1009							next EFFECT
1010	80	50	100			2160	unless my @G = grep $_ != $k && defined $full[$_], 0 .. $#full;
1011	80					632	my $G = ones($y->dim(0))->glue(1, grep ref $_, @full[@G]);
1012	80					77399	my $b_G = $y->ols_t( $G, {CONST=>0} );
1013							my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
1014	80	100				592	= $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
1015	80	100				6646	if ($k != $#full) {
1016	75					547	my $df = $ret{"$i_pref$e df"} = $full[$k]->dim(1);
1017	75	50				285	die "residual df = 0" if $df <= 0;
1018	75					262	$ret{"$i_pref$e ms"} = $ss / $df;
1019							}
1020							} else { # repeating error term
1021							my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
1022	17	100				110	= $ret{"\| $idv->[$full[$k]] \|$e ss"};
1023	17	100				92	if ($k != $#full) {
1024	8					63	my $df = $ret{"$i_pref$e df"} = $ret{"\| $idv->[$full[$k]] \|$e df"};
1025	8	50				21	die "residual df = 0" if $df <= 0;
1026	8					24	$ret{"$i_pref$e ms"} = $ss / $df;
1027							}
1028							}
1029							}
1030							} else {
1031	6					31	$ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
1032	6					132	$ret{F_df} = pdl(my $F_df0 = $dsgn->dim(0) - 1, my $df1 = $y->nelem - $dsgn->dim(0));
1033	6					348	$ret{ms_model} = $ret{ss_model} / $F_df0;
1034	6					117	$ret{ms_residual} = $ret{ss_residual} / $df1;
1035	6					80	$ret{F} = $ret{ms_model} / $ret{ms_residual};
1036	6	50	33			78	$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $F_df0, $df1 )
1037							if $CDF and $df1 > 0;
1038
1039							# get IV ss from $ivs_ref instead of $dsgn pdl
1040	6					29	my $ones = ones($y->dim(0));
1041	6					273	for my $k (0 .. $#$ivs_ref) {
1042	24					716	my $G = $ones->glue(1, @$ivs_ref[grep $_ != $k, 0 .. $#$ivs_ref]);
1043	24					14229	my $b_G = $y->ols_t( $G, {CONST=>0} );
1044							$ret{ "\| $idv->[$k] \| ss" }
1045	24					126	= $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual};
1046	24					1877	my $df0 = $ret{ "\| $idv->[$k] \| df" } = $ivs_ref->[$k]->dim(1);
1047	24					116	$ret{ "\| $idv->[$k] \|\| err df" } = $df1;
1048	24	100				172	die "residual df = 0" if $df1 <= 0;
1049	23					84	$ret{ "\| $idv->[$k] \| ms" } = $ret{ "\| $idv->[$k] \| ss" } / $df0;
1050							}
1051							}
1052							# have all iv, inter, and error effects. get F and F_p
1053	19					237	for (0 .. $#$ivs_ref) {
1054	77	100				210	my $ms_residual = defined $subj ? $ret{ "\| $idv->[$_] \|\| err ms" } : $ret{ms_residual};
1055	77					297	my ($df0, $df1) = @ret{"\| $idv->[$_] \| df" , "\| $idv->[$_] \|\| err df"};
1056	77					545	my $F = $ret{ "\| $idv->[$_] \| F" } = $ret{ "\| $idv->[$_] \| ms" } / $ms_residual;
1057	77	50	33			1137	$ret{ "\| $idv->[$_] \| F_p" } = 1 - $F->gsl_cdf_fdist_P($df0, $df1)
1058							if $CDF and $df1 > 0;
1059							}
1060
1061	19	100				157	for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze};
	570					9519
1062
1063	19					388	my ($inter_cmo_ref, $inter_cm_ref)
1064							= _interactions_cm($ivs_ref_filtrd, $pdl_ivs_raw);
1065							# append inter info to cell means effects
1066	19					72	my $ivs_cm_ref = [@$ivs_ref_filtrd, @$inter_cm_ref];
1067	19					59	my @i_cmo_ref = map pdl(values %{ (effect_code($_))[1] })->qsort, @$pdl_ivs_raw;
	40					3389
1068	19					1280	push @i_cmo_ref, @$inter_cmo_ref;
1069
1070	19					112	my $cm_ref
1071							= _cell_means( $y, $ivs_cm_ref, \@i_cmo_ref, $idv, $pdl_ivs_raw );
1072	19	100				103	if (defined $subj) {
1073	14					45	my @ls = map { $_->uniq->nelem } @$pdl_ivs_raw;
	29					7156
1074							$cm_ref
1075	14					6258	= _fix_rptd_se( $cm_ref, \%ret, $opt{IVNM}, \@ls, $sj->uniq->nelem );
1076							}
1077							# integrate mean and se into %ret
1078	19					248	@ret{ keys %$cm_ref } = values %$cm_ref;
1079
1080	19					50	my $highest = join(' ~ ', @{ $opt{IVNM} });
	19					190
1081							$cm_ref->{"\| $highest \| m"}->plot_means( $cm_ref->{"\| $highest \| se"},
1082							{ %opt, IVNM=>$idv } )
1083	19	50				86	if $opt{PLOT};
1084
1085	19					2174	%ret;
1086							}
1087
1088							=head2 anova_design_matrix
1089
1090							=for ref
1091
1092							Effect-coded design matrix for anova, including repeated-measures and
1093							mixed-model. The C for use in linear regression i.e. C.
1094
1095							Added in 0.854. See L for more.
1096
1097							Supports bad value in the dependent and independent variables. It
1098							automatically removes bad data listwise, i.e. remove a subject's data if
1099							there is any cell missing for the subject.
1100
1101							=for options
1102
1103							Default options (case insensitive):
1104
1105							V => 1, # carps if bad value in dv
1106							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1107							BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1108
1109							=for usage
1110
1111							$matrix = $dv->anova_design_matrix(undef, $b, $w, {ivnm=>[qw(b w)]});
1112							$matrix = $dv->anova_design_matrix(
1113							$subj, $b, $w, {ivnm=>[qw(b w)]}); # repeated-measures
1114							$matrix = $dv->anova_design_matrix(
1115							$subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # mixed-model
1116							($matrix, $ivnm_ref) = $dv->anova_design_matrix(
1117							$subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # list context also names
1118
1119							=cut
1120
1121							*anova_design_matrix = \&PDL::anova_design_matrix;
1122							sub PDL::anova_design_matrix {
1123	25	50		25	0	42871	my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
1124	25					90	my ($y, $subj, @ivs_raw) = @_;
1125	25	50				74	confess "No IVs: did you omit 'undef' for anova?" if !@ivs_raw;
1126	25	50				127	confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
1127	25					69	for (@ivs_raw) {
1128	53	50	66			1254	croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1;
1129							}
1130	25	100				940	my %opt = (
1131							V => 1, # carps if bad value in dv
1132							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1133							( !defined($subj) ? () : (
1134							BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1135							)),
1136							);
1137	25	50				93	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
	25					938
1138	3					13	$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
1139	25	100	66			99	if !$opt{IVNM} or !@{ $opt{IVNM} };
	25					126
1140	25					45	my @idv_orig = @{ $opt{IVNM} };
	25					77
1141	25					212	my @pdl_ivs_raw = map scalar PDL::Stats::Basic::code_ivs($_), @ivs_raw;
1142	25					87	my $pdl_ivs = pdl(\@pdl_ivs_raw);
1143							# explicit set badflag because pdl() removes badflag
1144	25					3686	$pdl_ivs->badflag( scalar grep $_->badflag, @pdl_ivs_raw );
1145	25					52	my $sj;
1146	25	100				90	if (defined($subj)) {
1147							# delete bad data listwise ie remove subj if any cell missing
1148	18					64	$sj = PDL::Stats::Basic::code_ivs($subj);
1149	18					336	my $ibad = which( $y->isbad \| nbadover($pdl_ivs->transpose) );
1150	18					3038	my $sj_bad = $sj->slice($ibad)->uniq;
1151	18	100				1028	if ($sj_bad->nelem) {
1152							warn $sj_bad->nelem . " subjects with missing data removed\n"
1153	4	50				18	if $opt{V};
1154							$sj = $sj->setvaltobad($_)
1155	4					14	for (list $sj_bad);
1156	4					134	my $igood = which $sj->isgood;
1157	4					308	for ($y, $sj, @pdl_ivs_raw) {
1158	16					38	$_ = $_->slice( $igood )->sever;
1159	16					618	$_->badflag(0);
1160							}
1161							}
1162							} else {
1163	7					36	($y, $pdl_ivs) = _rm_bad_value( $y, $pdl_ivs );
1164	7	50	66			288	if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) {
1165	0					0	warn sprintf "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem;
1166							}
1167	7					116	@pdl_ivs_raw = $pdl_ivs->dog;
1168							}
1169	25					742	my @ivs_ref_fltrd = map scalar effect_code($_), @pdl_ivs_raw;
1170	25					140	my ($ivs_inter_ref, $idv_inter) = _interactions(\@ivs_ref_fltrd, \@idv_orig);
1171							# append inter info to main effects
1172	25					67	my $ivs_ref = [@ivs_ref_fltrd, @$ivs_inter_ref];
1173	25					81	my @idv = (@idv_orig, @$idv_inter);
1174							# matches $ivs_ref, with an extra last pdl for subj effect
1175							my $err_ref = !defined($subj) ? [] :
1176	25	100				156	_add_errors( $sj, $ivs_ref, \@idv, \@pdl_ivs_raw, $opt{BTWN} );
1177	25					151	for (grep ref $err_ref->[$_], 0..$#$err_ref) {
1178	32					950	my ($null_row_ids, $non_null_row_ids) = $err_ref->[$_]->zcover->which_both;
1179	32	50				5447	confess "got null columns $null_row_ids in error entry #$_ ($idv[$_])"
1180							if !$null_row_ids->isempty;
1181							}
1182	25					292	my $dsgn = PDL::glue(1, ones($y->dim(0)), @$ivs_ref, (grep ref($_), @$err_ref))->t;
1183	25	100				22324	!wantarray ? $dsgn : ($dsgn, \@idv, $y, $sj, \@ivs_ref_fltrd, \@pdl_ivs_raw, $ivs_ref, $err_ref);
1184							}
1185
1186							# code (btwn group) subjects. Rutherford (2011) pp 208-209
1187							sub _code_btwn {
1188	18			18		38	my ($subj, $btwn) = @_;
1189	18					46	my (@grp, %grp_s);
1190	18					118	for my $n (0 .. $subj->nelem - 1) {
1191							# construct string to code group membership
1192							# something not treated as BAD by code_ivs to start off marking group membership
1193							# if no $btwn, everyone ends up in the same grp
1194	510					1195	my $s = '_' . join '', map $_->slice($n), @$btwn;
1195	510					22422	push @grp, $s; # group membership
1196	510					1052	$s .= $subj->slice($n); # keep track of total uniq subj
1197	510					27952	$grp_s{$s} = 1;
1198							}
1199	18					121	my $grp = PDL::Stats::Kmeans::iv_cluster \@grp;
1200	18					147	my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1);
1201	18					524	my $d1 = 0;
1202	18					100	for my $g (0 .. $grp->dim(1)-1) {
1203	33					139	my $col_inds = which $grp->slice(':',$g);
1204	33					4559	my $gsub = $subj->slice( $col_inds )->effect_code;
1205	33					314	my ($nobs, $nsub) = $gsub->dims;
1206	33					144	$spdl->slice($col_inds, [$d1,$d1+$nsub-1]) .= $gsub;
1207	33					2618	$d1 += $nsub;
1208							}
1209	18					215	$spdl;
1210							}
1211
1212							sub _add_errors {
1213	18			18		54	my ($subj, $ivs_ref, $idv, $raw_ivs, $btwn) = @_;
1214	18					87	my $spdl = _code_btwn($subj, [@$raw_ivs[@$btwn]]);
1215							# if btwn factor involved, or highest order inter for within factors
1216							# elem is undef, so that
1217							# @errors ind matches @$ivs_ref, with an extra elem at the end for subj
1218							# mark btwn factors for error terms
1219							# same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p205)
1220	18					117	my %is_btwn = map +($_=>1), @$idv[ @$btwn ];
1221	18					46	my $has_btwn = keys %is_btwn;
1222	18					155	my %idv2indx = map +($idv->[$_]=>$_), 0..$#$idv;
1223	18					42	my $ie_subj;
1224							my @errors = map {
1225	18					46	my @fs = split ' ~ ', $idv->[$_];
	64					189
1226							# separate bw and wn factors
1227							# if only bw, error is bw x subj
1228							# if only wn or wn and bw, error is wn x subj
1229	64	100				195	my @bw = !$has_btwn ? () : grep $is_btwn{$_}, @fs;
1230	64	100				185	my @wn = !$has_btwn ? @fs : grep !$is_btwn{$_}, @fs;
1231	64	100	100			250	$ie_subj = $_ if !defined($ie_subj) and !@wn;
1232	64	100				161	my $err = join ' ~ ', @wn ? @wn : @bw;
1233							# highest order inter of within factors, use ss_residual as error
1234	64	100	100			279	if ( @wn == @$raw_ivs - @$btwn ) { undef }
	31	100				110
		100
		100
1235							# repeating btwn factors use ss_subject as error
1236	2					5	elsif (!@wn and $_ > $ie_subj) { $ie_subj }
1237							# repeating error term
1238	6					13	elsif ($_ > $idv2indx{$err}) { $idv2indx{$err} }
1239	14					54	elsif (@wn) { interaction_code($ivs_ref->[$_], $spdl) }
1240	11					31	else { $spdl }
1241							} 0 .. $#$ivs_ref;
1242	18	100				70	push @errors, $has_btwn ? $ie_subj : $spdl;
1243	18					98	\@errors;
1244							}
1245
1246							sub _fix_rptd_se {
1247							# if ivnm lvls_ref for within ss only this can work for mixed design
1248	14			14		6291	my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_;
1249	14	100				358	my @se = map /^\\| (.+?) \\| se$/ ? $1 : (), keys %$cm_ref;
1250							my @n_obs = map {
1251	14					44	my @ivs = split / ~ /, $_;
	54					1082
1252	54					255	my $i_ivs = which_id $ivnm, \@ivs;
1253	54					2789	my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs;
1254	54	100				68631	my $collapsed = $icollapsed->nelem?
1255							pdl( @$lvls_ref[(list $icollapsed)] )->prodover
1256							: 1
1257							;
1258	54					3126	$n * $collapsed;
1259							} @se;
1260	14					265	for my $i (0 .. $#se) {
1261							$cm_ref->{"\| $se[$i] \| se"}
1262	54					1473	.= sqrt( $ret->{"\| $se[$i] \|\| err ms"} / $n_obs[$i] );
1263							}
1264	14					529	$cm_ref;
1265							}
1266
1267							=head2 dummy_code
1268
1269							=for ref
1270
1271							Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression.
1272
1273							Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1274
1275							=for usage
1276
1277							pdl> @a = qw(a a a b b b c c c)
1278							pdl> p $a = dummy_code(\@a)
1279							[
1280							[1 1 1 0 0 0 0 0 0]
1281							[0 0 0 1 1 1 0 0 0]
1282							]
1283
1284							=cut
1285
1286							*dummy_code = \&PDL::dummy_code;
1287							sub PDL::dummy_code {
1288	0			0	0	0	my ($var_ref) = @_;
1289	0					0	my $var_e = effect_code( $var_ref );
1290	0					0	$var_e->where( $var_e == -1 ) .= 0;
1291	0					0	$var_e;
1292							}
1293
1294							=head2 effect_code
1295
1296							=for ref
1297
1298							Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for
1299							use in regression. returns in @ context coded pdl and % ref to level -
1300							pdl->dim(1) index.
1301
1302							Supports BAD value (missing or 'BAD' values result in the corresponding
1303							pdl elements being marked as BAD).
1304
1305							=for usage
1306
1307							my @var = qw( a a a b b b c c c );
1308							my ($var_e, $map) = effect_code( \@var );
1309
1310							print $var_e . $var_e->info . "\n";
1311
1312							[
1313							[ 1 1 1 0 0 0 -1 -1 -1]
1314							[ 0 0 0 1 1 1 -1 -1 -1]
1315							]
1316							PDL: Double D [9,2]
1317
1318							print "$_\t$map->{$_}\n" for sort keys %$map
1319							a 0
1320							b 1
1321							c 2
1322
1323							=cut
1324
1325							*effect_code = \&PDL::effect_code;
1326							sub PDL::effect_code {
1327	171			171	0	27024	my ($var_ref) = @_;
1328	171					753	my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
1329	171					728	my $var_max = $var->max;
1330	171	100				3322	confess "effect_code called with only one unique value" if $var_max < 1;
1331	169					13285	my $var_e = yvals( float, $var->nelem, $var_max ) == $var;
1332	169					18236	$var_e->slice(which( $var == $var_max ), ) .= -1;
1333	169	100				42721	$var_e = $var_e->setbadif( $var->isbad ) if $var->badflag;
1334	169	100				2706	wantarray ? ($var_e, $map_ref) : $var_e;
1335							}
1336
1337							=head2 effect_code_w
1338
1339							=for ref
1340
1341							Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index.
1342
1343							Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1344
1345							=for usage
1346
1347							pdl> @a = qw( a a b b b c c )
1348							pdl> p $a = effect_code_w(\@a)
1349							[
1350							[ 1 1 0 0 0 -1 -1]
1351							[ 0 0 1 1 1 -1.5 -1.5]
1352							]
1353
1354							=cut
1355
1356							*effect_code_w = \&PDL::effect_code_w;
1357							sub PDL::effect_code_w {
1358	2			2	0	3723	my ($var_ref) = @_;
1359	2					7	my ($var_e, $map_ref) = effect_code( $var_ref );
1360	1	0				8	return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0;
		50
1361	1					143	my $pos = $var_e == 1;
1362	1					23	my $neg = $var_e == -1;
1363	1					30	my $w = $pos->sumover / $neg->sumover;
1364	1					26	my $neg_ind = $neg->whichND;
1365	1					552	$var_e->indexND($neg_ind) *= $w->slice($neg_ind->slice('(1)'));
1366	1	50				386	wantarray ? ($var_e, $map_ref) : $var_e;
1367							}
1368
1369							=head2 interaction_code
1370
1371							Returns the coded interaction term for effect-coded variables.
1372
1373							Supports BAD value (missing or 'BAD' values result in the corresponding
1374							pdl elements being marked as BAD).
1375
1376							=for usage
1377
1378							pdl> $a = sequence(6) > 2
1379							pdl> p $a = $a->effect_code
1380							[
1381							[ 1 1 1 -1 -1 -1]
1382							]
1383
1384							pdl> $b = pdl( qw( 0 1 2 0 1 2 ) )
1385							pdl> p $b = $b->effect_code
1386							[
1387							[ 1 0 -1 1 0 -1]
1388							[ 0 1 -1 0 1 -1]
1389							]
1390
1391							pdl> p $ab = interaction_code( $a, $b )
1392							[
1393							[ 1 0 -1 -1 -0 1]
1394							[ 0 1 -1 -0 -1 1]
1395							]
1396
1397							=cut
1398
1399							*interaction_code = \&PDL::interaction_code;
1400							sub PDL::interaction_code {
1401	72			72	0	1177	my $i = ones( $_[0]->dim(0), 1 );
1402	72					3832	$i = ($i * $_->dummy(1))->clump(1,2) for @_;
1403	72					22883	$i;
1404							}
1405
1406							=head2 ols
1407
1408							=for ref
1409
1410							Ordinary least squares regression, aka linear regression.
1411
1412							Unlike B, ols is not broadcastable, but it can handle bad value (by
1413							ignoring observations with bad value in dependent or independent variables
1414							list-wise) and returns the full model in list context with various stats.
1415
1416							IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
1417							the constant vector in $x. Intercept is automatically added and returned
1418							as FIRST of the coeffs if CONST=>1. Returns full model in list context
1419							and coeff in scalar context.
1420
1421							For more on multiple linear regression see
1422							L.
1423
1424							=for options
1425
1426							Default options (case insensitive):
1427
1428							CONST => 1,
1429							PLOT => 0, # see plot_residuals() for plot options
1430							WIN => undef, # for plotting
1431
1432							=for usage
1433
1434							Usage:
1435
1436							# suppose this is a person's ratings for top 10 box office movies
1437							# ascending sorted by box office
1438
1439							pdl> $y = pdl '[1 1 2 2 2 2 4 4 5 5]'
1440
1441							# construct IV with linear and quadratic component
1442
1443							pdl> p $x = cat sequence(10), sequence(10)**2
1444							[
1445							[ 0 1 2 3 4 5 6 7 8 9]
1446							[ 0 1 4 9 16 25 36 49 64 81]
1447							]
1448
1449							pdl> %m = $y->ols( $x )
1450
1451							pdl> p "$_\t@{[$m{$_} =~ /^\n(.?)\n*\z/s]}\n" for sort keys %m
1452
1453							F 40.4225352112676
1454							F_df [2 7]
1455							F_p 0.000142834216344756
1456							R2 0.920314253647587
1457
1458							# coeff constant linear quadratic
1459							b [0.981818 0.212121 0.030303]
1460							b_p [0.039910 0.328001 0.203034]
1461							b_se [0.389875 0.201746 0.021579]
1462							b_t [2.518284 1.051422 1.404218]
1463							ss_model 19.8787878787879
1464							ss_residual 1.72121212121212
1465							ss_total 21.6
1466							y_pred [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]
1467
1468							=cut
1469
1470							*ols = \&PDL::ols;
1471							sub PDL::ols {
1472	4			4	0	8761	_ols_common(0, @_);
1473							}
1474
1475							# ivs = [nobs x nivs] so can `dog` retval
1476							sub _rm_bad_value {
1477	11			11		30	my ($y, $ivs) = @_;
1478	11	50	66			41	return ($y, $ivs, undef) if !$y->check_badflag and !$ivs->check_badflag;
1479	3					1018	my $idx = which($y->isgood & (nbadover ($ivs->transpose)==0));
1480	3					596	$_ = $_->slice($idx)->sever for $y, $ivs;
1481	3					343	$_->badflag(0) for $y, $ivs;
1482	3					17	($y, $ivs, $idx);
1483							}
1484
1485							=head2 ols_rptd
1486
1487							=for ref
1488
1489							Repeated measures linear regression.
1490							Handles purely within-subject design for now.
1491
1492							(Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006).
1493							See F for an example using the Lorch and Myers data.
1494
1495							=for usage
1496
1497							Usage:
1498
1499							# This is the example from Lorch and Myers (1990),
1500							# a study on how characteristics of sentences affected reading time
1501							# Three within-subject IVs:
1502							# SP -- serial position of sentence
1503							# WORDS -- number of words in sentence
1504							# NEW -- number of new arguments in sentence
1505
1506							# $subj can be 1D pdl or @ ref and must be the first argument
1507							# IV can be 1D @ ref or pdl
1508							# 1D @ ref is effect coded internally into pdl
1509							# pdl is left as is
1510
1511							my %r = $rt->ols_rptd( $subj, $sp, $words, $new );
1512
1513							print "$_\t$r{$_}\n" for sort keys %r;
1514
1515							ss_residual 58.3754646504336
1516							ss_subject 51.8590337714286
1517							ss_total 405.188241771429
1518							# SP WORDS NEW
1519							F [ 7.208473 61.354153 1.0243311]
1520							F_p [0.025006181 2.619081e-05 0.33792837]
1521							coeff [0.33337285 0.45858933 0.15162986]
1522							df [1 1 1]
1523							df_err [9 9 9]
1524							ms [ 18.450705 73.813294 0.57026483]
1525							ms_err [ 2.5595857 1.2030692 0.55671923]
1526							ss [ 18.450705 73.813294 0.57026483]
1527							ss_err [ 23.036272 10.827623 5.0104731]
1528
1529							=cut
1530
1531							*ols_rptd = \&PDL::ols_rptd;
1532							sub PDL::ols_rptd {
1533	1			1	0	131	my ($y, $subj, @ivs_raw) = @_;
1534
1535	1					5	$y = $y->squeeze;
1536	1	50				46	$y->getndims > 1 and
1537							croak "ols_rptd does not support broadcasting";
1538
1539	1	50	33			3	my @ivs = map { (ref $_ eq 'PDL' and $_->ndims > 1)? $_
	3	50				127
1540							: ref $_ eq 'PDL' ? $_->dummy(1)
1541							: scalar effect_code($_)
1542							;
1543							} @ivs_raw;
1544
1545	1					36	my %r;
1546
1547	1					16	$r{ss_total} = $y->ss;
1548
1549							# STEP 1: subj
1550	1					6	my $s = effect_code $subj;
1551	1					5	my $b_s = $y->ols_t($s);
1552	1					5	my $pred = sumover($b_s->slice('1:-1') * $s->transpose) + $b_s->slice(0);
1553	1					104	$r{ss_subject} = $r{ss_total} - $y->sse( $pred );
1554
1555							# STEP 2: add predictor variables
1556	1					21	my $iv_p = $s->glue(1, @ivs);
1557	1					412	my $b_p = $y->ols_t($iv_p);
1558
1559							# only care about coeff for predictor vars. no subj or const coeff
1560	1					17	$r{coeff} = $b_p->slice([-@ivs,-1])->sever;
1561
1562							# get total sse for this step
1563	1					41	$pred = sumover($b_p->slice('1:-1') * $iv_p->transpose) + $b_p->slice(0);
1564	1					128	my $ss_pe = $y->sse( $pred );
1565
1566							# get predictor ss by successively reducing the model
1567	1					8	$r{ss} = zeroes scalar(@ivs);
1568	1					19	for my $i (0 .. $#ivs) {
1569	3					219	my $iv = $s->glue(1, @ivs[ grep $_ != $i, 0..$#ivs ]);
1570	3					987	my $b = $y->ols_t($iv);
1571	3					22	$pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
1572	3					678	$r{ss}->slice($i) .= $y->sse($pred) - $ss_pe;
1573							}
1574
1575							# STEP 3: get predictor x subj interaction as error term
1576	1					79	my $iv_e = PDL::glue 1, map interaction_code( $s, $_ ), @ivs;
1577
1578							# get total sse for this step. full model now.
1579	1					431	my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
1580	1					6	$pred = sumover($b_f->slice('1:-1') * $iv_p->glue(1,$iv_e)->transpose) + $b_f->slice(0);
1581	1					204	$r{ss_residual} = $y->sse( $pred );
1582
1583							# get predictor x subj ss by successively reducing the error term
1584	1					6	$r{ss_err} = zeroes scalar(@ivs);
1585	1					17	for my $i (0 .. $#ivs) {
1586	3					167	my $iv = $iv_p->glue(1, map interaction_code($s, $_),
1587							@ivs[grep $_ != $i, 0..$#ivs]);
1588	3					926	my $b = $y->ols_t($iv);
1589	3					14	my $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
1590	3					335	$r{ss_err}->slice($i) .= $y->sse($pred) - $r{ss_residual};
1591							}
1592
1593							# Finally, get MS, F, etc
1594
1595	1					78	$r{df} = pdl( map $_->squeeze->ndims, @ivs );
1596	1					251	$r{ms} = $r{ss} / $r{df};
1597
1598	1					25	$r{df_err} = $s->dim(1) * $r{df};
1599	1					24	$r{ms_err} = $r{ss_err} / $r{df_err};
1600
1601	1					15	$r{F} = $r{ms} / $r{ms_err};
1602
1603							$r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} )
1604	1	50				18	if $CDF;
1605
1606	1					53	%r;
1607							}
1608
1609
1610							=head2 logistic
1611
1612							=for ref
1613
1614							Logistic regression with maximum likelihood estimation using L.
1615
1616							IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
1617							the constant vector in $x. It is included in the model and returned as
1618							LAST of coeff. Returns full model in list context and coeff in scalar
1619							context.
1620
1621							The significance tests are likelihood ratio tests (-2LL deviance)
1622							tests. IV significance is tested by comparing deviances between the
1623							reduced model (ie with the IV in question removed) and the full model.
1624
1625							***NOTE: the results here are qualitatively similar to but not identical
1626							with results from R, because different algorithms are used for the
1627							nonlinear parameter fit. Use with discretion***
1628
1629							=for options
1630
1631							Default options (case insensitive):
1632
1633							INITP => zeroes( $x->dim(1) + 1 ), # n_iv + 1
1634							MAXIT => 1000,
1635							EPS => 1e-7,
1636
1637							=for usage
1638
1639							Usage:
1640
1641							# suppose this is whether a person had rented 10 movies
1642
1643							pdl> p $y = ushort( random(10)*2 )
1644							[0 0 0 1 1 0 0 1 1 1]
1645
1646							# IV 1 is box office ranking
1647
1648							pdl> p $x1 = sequence(10)
1649							[0 1 2 3 4 5 6 7 8 9]
1650
1651							# IV 2 is whether the movie is action- or chick-flick
1652
1653							pdl> p $x2 = sequence(10) % 2
1654							[0 1 0 1 0 1 0 1 0 1]
1655
1656							# concatenate the IVs together
1657
1658							pdl> p $x = cat $x1, $x2
1659							[
1660							[0 1 2 3 4 5 6 7 8 9]
1661							[0 1 0 1 0 1 0 1 0 1]
1662							]
1663
1664							pdl> %m = $y->logistic( $x )
1665
1666							pdl> p "$_\t$m{$_}\n" for sort keys %m
1667
1668							D0 13.8629436111989
1669							Dm 9.8627829791575
1670							Dm_chisq 4.00016063204141
1671							Dm_df 2
1672							Dm_p 0.135324414081692
1673							# ranking genre constant
1674							b [0.41127706 0.53876358 -2.1201285]
1675							b_chisq [ 3.5974504 0.16835559 2.8577151]
1676							b_p [0.057868258 0.6815774 0.090936587]
1677							iter 12
1678							y_pred [0.10715577 0.23683909 ... 0.76316091 0.89284423]
1679
1680							# to get the covariance out, supply a true value for the COV option:
1681							pdl> %m = $y->logistic( $x, {COV=>1} )
1682							pdl> p $m{cov};
1683
1684							=cut
1685
1686							*logistic = \&PDL::logistic;
1687	0			0	0	0	sub PDL::logistic {
1688	0					0	require PDL::Fit::LM;
1689	0					0	my ( $self, $ivs, $opt ) = @_;
1690							$self = $self->squeeze;
1691	0	0				0	# make compatible w multiple var cases
1692	0	0				0	$ivs->getndims == 1 and $ivs = $ivs->dummy(1);
1693							$self->dim(0) != $ivs->dim(0) and
1694	0					0	carp "mismatched n btwn DV and IV!";
1695							my %opt = (
1696							INITP => zeroes( $ivs->dim(1) + 1 ), # n_ivs + 1
1697							MAXIT => 1000,
1698							EPS => 1e-7,
1699	0	0				0	);
	0					0
1700							if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1701	0					0	# not using it atm
1702							$opt{WT} = 1;
1703							# Use lmfit. Fourth input argument is reference to user-defined
1704							# copy INITP so we have the original value when needed
1705							my ($yfit,$coeff,$cov,$iter)
1706	0					0	= PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy,
1707							{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1708							# apparently at least coeff is child of some pdl
1709	0					0	# which is changed in later lmfit calls
1710	0					0	$yfit = $yfit->copy;
1711	0	0				0	$coeff = $coeff->copy;
1712	0					0	return $coeff unless wantarray;
1713	0					0	my %ret;
1714	0					0	my $n0 = $self->where($self == 0)->nelem;
1715	0	0				0	my $n1 = $self->nelem - $n0;
1716	0					0	$ret{cov} = $cov if $opt{COV};
1717	0					0	$ret{D0} = -2($n0 log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
1718	0					0	$ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
1719	0					0	$ret{Dm_chisq} = $ret{D0} - $ret{Dm};
1720							$ret{Dm_df} = $ivs->dim(1);
1721							$ret{Dm_p}
1722	0	0				0	= 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
1723	0					0	if $CDF;
1724	0	0				0	my $coeff_chisq = zeroes $opt{INITP}->nelem;
1725	0					0	if ( $ivs->dim(1) > 1 ) {
1726	0					0	for my $k (0 .. $ivs->dim(1)-1) {
	0					0
1727	0					0	my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
1728	0					0	my $G = $ivs->dice_axis(1, \@G);
1729							my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
1730							my $y_G
1731	0					0	= PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
1732	0					0	{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1733							$coeff_chisq->slice($k) .= $self->dm( $y_G ) - $ret{Dm};
1734							}
1735							}
1736							else {
1737	0					0	# d0 is, by definition, the deviance with only intercept
1738							$coeff_chisq->slice(0) .= $ret{D0} - $ret{Dm};
1739							}
1740							my $y_c
1741	0					0	= PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->slice('0:-2')->sever,
1742	0					0	{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1743	0					0	$coeff_chisq->slice(-1) .= $self->dm( $y_c ) - $ret{Dm};
1744	0					0	$ret{b} = $coeff;
1745	0	0				0	$ret{b_chisq} = $coeff_chisq;
1746							$ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 )
1747	0					0	if $CDF;
1748	0					0	$ret{y_pred} = $yfit;
1749	0	0				0	$ret{iter} = $iter;
	0					0
1750	0					0	for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
1751							%ret;
1752							}
1753
1754	0			0		0	sub _logistic {
1755							my ($x,$par,$ym,$dyda) = @_;
1756	0					0	# $b and $c are fit parameters slope and intercept
1757	0					0	my $b = $par->slice([0,$x->dim(1) - 1])->sever;
1758							my $c = $par->slice(-1)->sever;
1759							# Write function with dependent variable $ym,
1760							# independent variable $x, and fit parameters as specified above.
1761							# Use the .= (dot equals) assignment operator to express the equality
1762	0					0	# (not just a plain equals)
1763	0					0	$ym .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );
1764							my @dy = map $dyda->slice(",($_)"), 0 .. $par->dim(0)-1;
1765							# Partial derivative of the function with respect to each slope
1766							# fit parameter ($b in this case). Again, note .= assignment
1767							# operator (not just "equals")
1768	0					0	$dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
1769							for (0 .. $b->dim(0)-1);
1770	0					0	# Partial derivative of the function re intercept par
1771							$dy[-1] .= $ym * (1 - $ym);
1772							}
1773
1774	0			0		0	sub _logistic_no_intercept {
1775	0					0	my ($x,$par,$ym,$dyda) = @_;
1776							my $b = $par->slice([0,$x->dim(1) - 1])->sever;
1777							# Write function with dependent variable $ym,
1778							# independent variable $x, and fit parameters as specified above.
1779							# Use the .= (dot equals) assignment operator to express the equality
1780	0					0	# (not just a plain equals)
1781	0					0	$ym .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );
	0					0
1782							my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
1783							# Partial derivative of the function with respect to each slope
1784							# fit parameter ($b in this case). Again, note .= assignment
1785							# operator (not just "equals")
1786	0					0	$dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
1787							for 0 .. $b->dim(0)-1;
1788							}
1789
1790							=head2 pca
1791
1792							=for ref
1793
1794							Principal component analysis. Based on corr instead of cov.
1795
1796							Bad values are ignored pair-wise. OK when bad values are few but otherwise
1797							probably should fill_m etc before pca). Uses L.
1798
1799							=for options
1800
1801							Default options (case insensitive):
1802
1803							CORR => 1, # boolean. use correlation or covariance
1804							PLOT => 0, # calls plot_screes by default
1805							# can set plot_screes options here
1806							WIN => undef, # for plotting
1807
1808							=for usage
1809
1810							Usage:
1811
1812							my $d = qsort random 10, 5; # 10 obs on 5 variables
1813							my %r = $d->pca( \%opt );
1814							print "$_\t$r{$_}\n" for (keys %r);
1815
1816							eigenvalue # variance accounted for by each component
1817							[4.70192 0.199604 0.0471421 0.0372981 0.0140346]
1818
1819							eigenvector # dim var x comp. weights for mapping variables to component
1820							[
1821							[ -0.451251 -0.440696 -0.457628 -0.451491 -0.434618]
1822							[ -0.274551 0.582455 0.131494 0.255261 -0.709168]
1823							[ 0.43282 0.500662 -0.139209 -0.735144 -0.0467834]
1824							[ 0.693634 -0.428171 0.125114 0.128145 -0.550879]
1825							[ 0.229202 0.180393 -0.859217 0.4173 0.0503155]
1826							]
1827
1828							loadings # dim var x comp. correlation between variable and component
1829							[
1830							[ -0.978489 -0.955601 -0.992316 -0.97901 -0.942421]
1831							[ -0.122661 0.260224 0.0587476 0.114043 -0.316836]
1832							[ 0.0939749 0.108705 -0.0302253 -0.159616 -0.0101577]
1833							[ 0.13396 -0.0826915 0.0241629 0.0247483 -0.10639]
1834							[ 0.027153 0.0213708 -0.101789 0.0494365 0.00596076]
1835							]
1836
1837							pct_var # percent variance accounted for by each component
1838							[0.940384 0.0399209 0.00942842 0.00745963 0.00280691]
1839
1840							Plot scores along the first two components,
1841
1842							$d->plot_scores( $r{eigenvector} );
1843
1844							=cut
1845
1846							*pca = \&PDL::pca;
1847	3			3	0	16578	sub PDL::pca {
1848	3					17	my ($self, $opt) = @_;
1849							my %opt = (
1850							CORR => 1,
1851							PLOT => 0,
1852							WIN => undef, # for plotting
1853	3	50				14	);
	3					21
1854	3	100				344	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1855							my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;
1856	3					67	# value is axis pdl and score is var x axis
1857	3					4008	my ($eigvec, $eigval) = $var_var->eigens_sym;
1858							$eigvec = $eigvec->transpose; # compatibility with PDL::Slatec::eigsys
1859	3					182	# ind is sticky point for broadcasting
1860	3					196	my $ind_sorted = $eigval->qsorti->slice('-1:0');
1861	3					409	$eigvec = $eigvec->slice(':',$ind_sorted)->sever;
1862							$eigval = $eigval->slice($ind_sorted)->sever;
1863	3					289	# var x axis
1864	3					221	my $var = $eigval / $eigval->sum->sclr;
1865	3	100				14	my $loadings;
1866	2					8	if ($opt{CORR}) {
1867							$loadings = $eigvec * sqrt( $eigval->transpose );
1868							}
1869	1					25	else {
1870	1					185	my $scores = $eigvec x $self->dev_m;
1871							$loadings = $self->corr( $scores->dummy(1) );
1872							}
1873	3	50				389	$var->plot_screes(\%opt)
1874	3					99	if $opt{PLOT};
1875							( eigenvalue=>$eigval, eigenvector=>$eigvec,
1876							pct_var=>$var, loadings=>$loadings );
1877							}
1878
1879							=head2 pca_sorti
1880
1881							Determine by which vars a component is best represented. Descending sort
1882							vars by size of association with that component. Returns sorted var and
1883							relevant component indices.
1884
1885							=for options
1886
1887							Default options (case insensitive):
1888
1889							NCOMP => 10, # maximum number of components to consider
1890
1891							=for usage
1892
1893							Usage:
1894
1895							# let's see if we replicated the Osgood et al. (1957) study
1896							pdl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}
1897
1898							# select a subset of var to do pca
1899							pdl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
1900							pdl> $data = $data( ,$ind)->sever
1901							pdl> @$idv = @$idv[list $ind]
1902
1903							pdl> %m = $data->pca
1904
1905							pdl> ($iv, $ic) = $m{loadings}->pca_sorti()
1906
1907							pdl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)
1908
1909							# COMP0 COMP1 COMP2 COMP3
1910							HAPPY [0.860191 0.364911 0.174372 -0.10484]
1911							GOOD [0.848694 0.303652 0.198378 -0.115177]
1912							CALM [0.821177 -0.130542 0.396215 -0.125368]
1913							BRIGHT [0.78303 0.232808 -0.0534081 -0.0528796]
1914							HEAVY [-0.623036 0.454826 0.50447 0.073007]
1915							HARD [-0.679179 0.0505568 0.384467 0.165608]
1916							ACTIVE [-0.161098 0.760778 -0.44893 -0.0888592]
1917							FAST [-0.196042 0.71479 -0.471355 0.00460276]
1918							LARGE [-0.241994 0.594644 0.634703 -0.00618055]
1919							BASS [-0.621213 -0.124918 0.0605367 -0.765184]
1920
1921							=cut
1922
1923							*pca_sorti = \&PDL::pca_sorti;
1924							sub PDL::pca_sorti {
1925	1			1	0	9917	# $self is pdl (var x component)
1926	1					4	my ($self, $opt) = @_;
1927							my %opt = (
1928							NCOMP => 10, # maximum number of components to consider
1929	1	50				6	);
	0					0
1930	1					8	if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1931	1					187	my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
1932	1					244	$self = $self->dice_axis( 1, pdl(0..$ncomp-1) );
1933							my $icomp = $self->transpose->abs->maximum_ind;
1934	1					95	# sort between comp
1935	1					6	my $ivar_sort = $icomp->qsorti;
1936							$self = $self->slice($ivar_sort)->sever;
1937	1					74	# sort within comp
1938	1					12	my $ic = $icomp->slice($ivar_sort)->iv_cluster;
1939	3					893	for my $comp (0 .. $ic->dim(1)-1) {
1940	3					739	my $i = $self->slice(which($ic->slice(':',$comp)), "($comp)")->qsorti->slice('-1:0');
1941							$ivar_sort->slice(which $ic->slice(':',$comp))
1942							.= $ivar_sort->slice(which $ic->slice(':',$comp))->slice($i);
1943	1	50				421	}
1944							wantarray ? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
1945							}
1946
1947							=head2 plot_means
1948
1949							Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).
1950
1951							=for options
1952
1953							Default options (case insensitive):
1954
1955							IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
1956							DVNM => 'DV',
1957							AUTO => 1, # auto set dims to be on x-axis, line, panel
1958							# if set 0, dim 0 goes on x-axis, dim 1 as lines
1959							# dim 2+ as panels
1960							# see PDL::Graphics::Simple for next option
1961							WIN => undef, # pgswin object. not closed here if passed
1962							# allows comparing multiple lines in same plot
1963							SIZE => 640, # individual square panel size in pixels
1964
1965							=for usage
1966
1967							Usage:
1968
1969							# see anova for mean / se pdl structure
1970							$mean->plot_means( $se, {IVNM=>['apple', 'bake']} );
1971
1972							Or like this:
1973
1974							$m{'\| apple ~ bake \| m'}->plot_means;
1975
1976							=cut
1977
1978							*plot_means = \&PDL::plot_means;
1979	0			0	0		sub PDL::plot_means {
1980	0	0					require PDL::Graphics::Simple;
1981	0						my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
1982	0						my ($self, $se) = @_;
1983	0	0					$self = $self->squeeze;
1984	0						if ($self->ndims > 4) {
1985	0						carp "Data is > 4D. No plot here.";
1986							return;
1987	0						}
1988							my %opt = (
1989							IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
1990							DVNM => 'DV',
1991							AUTO => 1, # auto set vars to be on X axis, line, panel
1992							WIN => undef, # PDL::Graphics::Simple object
1993							SIZE => 640, # individual square panel size in pixels
1994	0	0					);
	0
1995							if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
1996							# decide which vars to plot as x axis, lines, panels
1997							# put var w most levels on x axis
1998	0						# put var w least levels on diff panels
1999	0						my @iD = 0..3;
2000							my @dims = (1, 1, 1, 1);
2001	0						# splice ARRAY,OFFSET,LENGTH,LIST
2002	0						splice @dims, 0, $self->ndims, $self->dims;
2003	0	0					$self = $self->reshape(@dims)->sever;
2004							$se = $se->reshape(@dims)->sever
2005							if defined $se;
2006	0	0					@iD = reverse list qsorti pdl @dims
2007							if $opt{AUTO};
2008							# $iD[0] on x axis
2009	0						# $iD[1] as separate lines
2010	0						my $nx = $self->dim($iD[2]); # n xpanels
2011							my $ny = $self->dim($iD[3]); # n ypanels
2012	0		0				my $w = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin(
2013	0						size=>[$opt{SIZE}$nx, $opt{SIZE}$ny,'px']);
2014	0						my $seq0 = sequence(my $dim0 = $self->dim($iD[0]));
2015	0						my ($pcount, @plots) = 0;
2016	0						for my $y (0..$ny-1) {
2017	0						for my $x (0..$nx-1) {
2018	0	0					my $key_prefix = "$opt{IVNM}[$iD[0]]\|";
2019	0	0					$key_prefix .= $opt{IVNM}[$iD[2]] . "=$x\|" if $nx > 1;
2020	0						$key_prefix .= $opt{IVNM}[$iD[3]] . "=$y\|" if $ny > 1;
2021	0						for (0 .. $self->dim($iD[1]) - 1) {
2022	0						my $ke = "$key_prefix$opt{IVNM}[$iD[1]]=$_";
2023	0						my $ydiced = $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_)->squeeze;
2024							push @plots, with=>'lines', ke=>"$ke mean", style=>$pcount,
2025	0	0					$seq0+$pcount*0.05, $ydiced;
2026							push @plots, with=>'errorbars', ke=>"$ke error", style=>$pcount,
2027							$seq0+$pcount*0.05, $ydiced,
2028							$se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
2029							->dice_axis($iD[1],$_)->squeeze
2030	0						if defined($se);
2031							$pcount++;
2032							}
2033							}
2034	0	0					}
2035							my ($ymin, $ymax) = pdl($self, !defined $se ? () : ($self+$se, $self-$se))->minmax;
2036							$w->plot(@plots,
2037	0						{ xlabel=>$opt{IVNM}[$iD[0]], ylabel=>$opt{DVNM},
2038							xrange=>[-0.05,$dim0-1+$pcount*0.05], yrange=>[$ymin-0.05,$ymax+0.05] }
2039	0						);
2040							$w;
2041							}
2042
2043							=head2 plot_residuals
2044
2045							Plots residuals against predicted values.
2046
2047							=for usage
2048
2049							Usage:
2050
2051							use PDL::Graphics::Simple;
2052							$w = pgswin();
2053							$y->plot_residuals( $y_pred, { win=>$w } );
2054
2055							=for options
2056
2057							Default options (case insensitive):
2058
2059							# see PDL::Graphics::Simple for more info
2060							WIN => undef, # pgswin object. not closed here if passed
2061							# allows comparing multiple lines in same plot
2062							SIZE => 640, # plot size in pixels
2063							COLOR => 1,
2064
2065							=cut
2066
2067							*plot_residuals = \&PDL::plot_residuals;
2068	0			0	0		sub PDL::plot_residuals {
2069	0	0					require PDL::Graphics::Simple;
2070	0						my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2071	0						my ($y, $y_pred) = @_;
2072							my %opt = (
2073							# see PDL::Graphics::Simple for next options
2074							WIN => undef, # pgswin object. not closed here if passed
2075							# allows comparing multiple lines in same plot
2076							SIZE => 640, # plot size in pixels
2077							COLOR => 1,
2078	0	0					);
	0
2079	0						if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2080	0		0				my $residuals = $y - $y_pred;
2081							my $win = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2082							$win->plot(
2083	0						with=>'points', style=>$opt{COLOR}, $y_pred, $residuals,
2084							with=>'lines', style=>$opt{COLOR}, pdl($y_pred->minmax), pdl(0,0), # 0-line
2085							{xlabel=>'predicted value', ylabel=>'residuals'},
2086							);
2087							}
2088
2089
2090							=head2 plot_scores
2091
2092							Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.)
2093
2094							=for options
2095
2096							Default options (case insensitive):
2097
2098							CORR => 1, # boolean. PCA was based on correlation or covariance
2099							COMP => [0,1], # indices to components to plot
2100							# see PDL::Graphics::Simple for next options
2101							WIN => undef, # pgswin object. not closed here if passed
2102							# allows comparing multiple lines in same plot
2103							SIZE => 640, # plot size in pixels
2104							COLOR => [1,2], # color for original and rotated scores
2105
2106							=for usage
2107
2108							Usage:
2109
2110							my %p = $data->pca();
2111							$data->plot_scores( $p{eigenvector}, \%opt );
2112
2113							=cut
2114
2115	0			0	0		*plot_scores = \&PDL::plot_scores;
2116	0	0					sub PDL::plot_scores {
2117	0						require PDL::Graphics::Simple;
2118	0						my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2119							my ($self, $eigvec) = @_;
2120							my %opt = (
2121							CORR => 1, # boolean. PCA was based on correlation or covariance
2122							COMP => [0,1], # indices to components to plot
2123							# see PDL::Graphics::Simple for next options
2124							WIN => undef, # pgswin object. not closed here if passed
2125							# allows comparing multiple lines in same plot
2126							SIZE => 640, # plot size in pixels
2127	0	0					COLOR => [1,2], # color for original and transformed scoress
	0
2128	0						);
2129	0	0					if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
2130							my $i = pdl $opt{COMP};
2131	0						my $z = $opt{CORR} ? $self->stddz : $self->dev_m;
2132	0						# transformed normed values
2133	0		0				my $scores = sumover($eigvec->slice(':',$i) * $z->transpose->dummy(1))->transpose;
2134							$z = $z->slice(':',$i)->sever;
2135							my $win = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2136	0						$win->plot(
2137							with=>'points', style=>$opt{COLOR}[0], ke=>'original', $z->slice(',(0)'), $z->slice(',(1)'),
2138							with=>'points', style=>$opt{COLOR}[1], ke=>'transformed', $scores->slice(',(0)'), $scores->slice(',(1)'),
2139							{xlabel=>"Component $opt{COMP}[0]", ylabel=>"Component $opt{COMP}[1]"},
2140							);
2141							}
2142
2143
2144							=head2 plot_screes
2145
2146							Scree plot. Plots proportion of variance accounted for by PCA components.
2147
2148							=for options
2149
2150							Default options (case insensitive):
2151
2152							NCOMP => 20, # max number of components to plot
2153							CUT => 0, # set to plot cutoff line after this many components
2154							# undef to plot suggested cutoff line for NCOMP comps
2155							# see PDL::Graphics::Simple for next options
2156							WIN => undef, # pgswin object. not closed here if passed
2157							# allows comparing multiple lines in same plot
2158							SIZE => 640, # plot size in pixels
2159
2160							=for usage
2161
2162							Usage:
2163
2164							# variance should be in descending order
2165							$d = qsort random 10, 5; # 10 obs on 5 variables
2166							%pca = $d->pca( \%opt );
2167							$pca{pct_var}->plot_screes( {ncomp=>16, win=>$win=PDL::Graphics::Simple::pgswin()} );
2168
2169							Or, because NCOMP is used so often, it is allowed a shortcut,
2170
2171							$pca{pct_var}->plot_screes( 16 );
2172
2173							=cut
2174
2175	0			0	0		*plot_scree = \&PDL::plot_screes; # here for now for compatibility
2176	0	0					*plot_screes = \&PDL::plot_screes;
2177	0						sub PDL::plot_screes {
2178	0						require PDL::Graphics::Simple;
2179							my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2180							my ($self, $ncomp) = @_;
2181							my %opt = (
2182							NCOMP => 20, # max number of components to plot
2183							CUT => 0, # set to plot cutoff line after this many components
2184							# undef to plot suggested cutoff line for NCOMP comps
2185							# see PDL::Graphics::Simple for next options
2186							WIN => undef, # pgswin object. not closed here if passed
2187	0	0					# allows comparing multiple lines in same plot
	0
2188	0	0					SIZE => 640, # plot size in pixels
2189							);
2190							if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2191	0	0					$opt{NCOMP} = $ncomp
2192	0						if $ncomp;
2193							# re-use $ncomp below
2194	0	0					$ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
2195							my $self_comp = $self->slice([0,$ncomp-1]);
2196	0		0				$opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self_comp
2197							if !defined $opt{CUT};
2198							my $win = $opt{WIN} \|\|
2199	0	0					PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2200							$win->plot(
2201							with=>'lines', ke=>'scree', sequence($ncomp), $self_comp,
2202							!$opt{CUT} ? () : (with=>'lines', ke=>'cut', pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05)),
2203							{xlabel=>'Component', ylabel=>'Proportion of Variance Accounted for',
2204							xrange=>[-0.05,$ncomp-0.95], yrange=>[0,1], le=>'tr'},
2205							);
2206							}
2207
2208							=head2 plot_stripchart
2209
2210							Stripchart plot. Plots ANOVA-style data, categorised against given IVs.
2211
2212							=for options
2213
2214							Default options (case insensitive):
2215
2216							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
2217							DVNM => 'DV',
2218							# see PDL::Graphics::Simple for next options
2219							WIN => undef, # pgswin object. not closed here if passed
2220
2221							=for usage
2222
2223							Usage:
2224
2225							%m = $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)] } );
2226
2227							=cut
2228
2229	0			0	0		my $CHART_GAP = 0.1;
2230	0	0					*plot_stripchart = \&PDL::plot_stripchart;
2231	0						sub PDL::plot_stripchart {
2232	0						require PDL::Graphics::Simple;
2233							my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2234							my ($y, @ivs_raw) = @_;
2235							my %opt = (
2236							IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
2237	0	0					DVNM => 'DV',
	0
2238	0						WIN => undef, # pgswin object. not closed here if passed
2239	0	0	0				);
	0
2240	0		0				if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2241	0						$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
2242							if !$opt{IVNM} or !@{ $opt{IVNM} };
2243	0						my $w = $opt{WIN} \|\| PDL::Graphics::Simple::pgswin();
	0
2244	0						my @codes = map [code_ivs($_)], @ivs_raw;
	0
2245							my @levels = map {
2246	0						my $map = $_->[1];
2247	0						[sort {$map->{$a} <=> $map->{$b}} keys %$map];
2248	0						} @codes;
2249	0						my $xjitter = $y->random * $CHART_GAP;
2250	0						my ($pcount, @plots) = 0;
2251	0						push @plots, with=>'points', ke=>"all data", $xjitter+$pcount, $y;
2252	0						$pcount++;
2253	0						for my $i (0..$#ivs_raw) {
2254	0						my $levs = $levels[$i];
2255	0						my $name = $opt{IVNM}[$i];
2256	0						my $coded = $codes[$i][0];
2257							for my $j (0..$#$levs) {
2258							my $inds = which($coded == $j);
2259	0						push @plots, with=>'points', ke=>"$name=$levs->[$j]",
2260							$xjitter->slice($inds)+$pcount+$j*$CHART_GAP, $y->slice($inds);
2261	0						}
2262	0						$pcount++;
	0
2263							}
2264							my ($ymin, $ymax) = $y->minmax;
2265	0						my $xmax = $pcount-1 + $CHART_GAP*($#{$levels[-1]} + 2);
2266							$w->plot(@plots,
2267	0						{ xlabel=>'IV', ylabel=>$opt{DVNM},
2268							xrange=>[-1,$xmax], yrange=>[$ymin-$CHART_GAP,$ymax+$CHART_GAP] }
2269							);
2270							$w;
2271							}
2272
2273							=head1 SEE ALSO
2274
2275							L
2276
2277							L
2278
2279							=head1 REFERENCES
2280
2281							Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied
2282							Multiple Regression/correlation Analysis for the Behavioral Sciences
2283							(3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers.
2284
2285							Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd
2286							ed.). New York, NY: Wiley-Interscience.
2287
2288							Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated
2289							measures data in cognitive research. Journal of Experimental Psychology:
2290							Learning, Memory, & Cognition, 16, 149-157.
2291
2292							Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of
2293							Meaning. Champaign, IL: University of Illinois Press.
2294
2295							Rutherford, A. (2011). ANOVA and ANCOVA: A GLM Approach (2nd ed.). Wiley.
2296
2297							Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved
2298							April 10, 2011 from http://citeseerx.ist.psu.edu/
2299
2300							The GLM procedure: unbalanced ANOVA for two-way design with
2301							interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18,
2302							2009 from http://support.sas.com/
2303
2304							Van den Noortgate, W., & Onghena, P. (2006). Analysing repeated
2305							measures data in cognitive research: A comment on regression coefficient
2306							analyses. European Journal of Cognitive Psychology, 18, 937-952.
2307
2308							=head1 AUTHOR
2309
2310							Copyright (C) 2009 Maggie J. Xiong
2311
2312							All rights reserved. There is no warranty. You are allowed to redistribute
2313							this software / documentation as described in the file COPYING in the
2314							PDL distribution.
2315
2316							=cut
2317
2318							EOD
2319
2320							pp_done();