File Coverage

lib/PDL/Stats/GLM.pd
Criterion Covered Total %
statement 425 588 72.2
branch 161 272 59.1
condition 36 69 52.1
subroutine 31 41 75.6
pod 0 19 0.0
total 653 989 66.0


line stmt bran cond sub pod time code
1             use strict;
2             use warnings;
3              
4             my $F = [qw(F D)];
5              
6             pp_add_exported(qw(ols_t ols ols_rptd
7             anova anova_rptd anova_design_matrix
8             dummy_code effect_code effect_code_w interaction_code
9             r2_change logistic
10             pca pca_sorti
11             plot_means plot_residuals plot_screes
12             ));
13              
14             pp_addpm({At=>'Top'}, <<'EOD');
15 1     1   9  
  1         2  
  1         43  
16 1     1   5 use strict;
  1         1  
  1         66  
17             use warnings;
18 1     1   5  
  1         1  
  1         108  
19 1     1   7 use Carp;
  1         2  
  1         7  
20 1     1   2817 use PDL::LiteF;
  1         2  
  1         11  
21 1     1   247 use PDL::MatrixOps;
  1         2  
  1         6  
22 1     1   1097 use PDL::Stats::Basic;
  1         4  
  1         12  
23             use PDL::Stats::Kmeans;
24              
25             eval { require PDL::Core; require PDL::GSL::CDF; };
26             my $CDF = 1 if !$@;
27              
28             =encoding utf8
29              
30             =head1 NAME
31              
32             PDL::Stats::GLM -- general and generalized linear modelling methods such as ANOVA, linear regression, PCA, and logistic regression.
33              
34             =head1 SYNOPSIS
35              
36             use PDL::LiteF;
37             use PDL::Stats::GLM;
38              
39             # do a multiple linear regression and plot the residuals
40             my $y = pdl( 8, 7, 7, 0, 2, 5, 0 );
41             my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component
42             [ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component
43             my %m = $y->ols( $x, {plot=>1} );
44             print "$_\t$m{$_}\n" for sort keys %m;
45              
46             =head1 DESCRIPTION
47              
48             For more about general linear modelling, see
49             L.
50             For an unbelievably thorough text on experimental design
51             and analysis, including linear modelling, see L
52             Course in Design and Analysis of Experiments, Gary
53             W. Oehlert|http://users.stat.umn.edu/~gary/book/fcdae.pdf>.
54              
55             The terms FUNCTIONS and METHODS are arbitrarily used to refer to
56             methods that are broadcastable and methods that are NOT broadcastable,
57             respectively. FUNCTIONS support bad values.
58              
59             P-values, where appropriate, are provided if PDL::GSL::CDF is installed.
60              
61             =cut
62              
63             EOD
64              
65             pp_addhdr('
66             #include
67             #include
68             #include
69              
70             '
71             );
72              
73             pp_def('fill_m',
74             Pars => 'a(n); [o]b(n)',
75             Inplace => 1,
76             GenericTypes => $F,
77             HandleBad => 1,
78             Code => '
79             broadcastloop %{
80             PDL_IF_BAD($GENERIC(b) sa = 0;
81             PDL_Indx N = 0;
82             loop (n) %{
83             if ($ISBAD($a())) continue;
84             sa += $a();
85             N++;
86             %}
87             $GENERIC(b) m = N ? sa / N : 0;,)
88             loop (n) %{
89             $b() = PDL_IF_BAD($ISBAD($a()) ? m :,) $a();
90             %}
91             %}
92             $PDLSTATESETGOOD(b);
93             ',
94             Doc => '
95             =for ref
96              
97             Replaces bad values with sample mean. Mean is set to 0 if all obs are
98             bad.
99              
100             =for usage
101              
102             pdl> p $data
103             [
104             [ 5 BAD 2 BAD]
105             [ 7 3 7 BAD]
106             ]
107              
108             pdl> p $data->fill_m
109             [
110             [ 5 3.5 2 3.5]
111             [ 7 3 7 5.66667]
112             ]
113             ',
114             BadDoc => 'The output pdl badflag is cleared.',
115             );
116              
117             pp_def('fill_rand',
118             Pars => 'a(n); [o]b(n)',
119             Inplace => 1,
120             HandleBad => 1,
121             Code => '
122             broadcastloop %{
123             PDL_IF_BAD($GENERIC(a) *g[ $SIZE(n) ];
124             PDL_Indx i = 0;
125             srand( time( NULL ) );
126             loop (n) %{
127             if ( $ISGOOD($a()) ) {
128             g[i++] = &$a();
129             }
130             %},)
131             loop (n) %{
132             PDL_IF_BAD(if ($ISBAD($a())) {
133             /* XXX-FIXME works on 64bit, but rand() is quite limited */
134             PDL_Indx j = (PDL_Indx) ((i-1) * (double)(rand()) / (double)(RAND_MAX) + .5);
135             $b() = *g[j];
136             continue;
137             },)
138             $b() = $a();
139             %}
140             %}
141             $PDLSTATESETGOOD(b);
142             ',
143             Doc => '
144             =for ref
145              
146             Replaces bad values with random sample (with replacement) of good
147             observations from the same variable.
148              
149             =for usage
150              
151             pdl> p $data
152             [
153             [ 5 BAD 2 BAD]
154             [ 7 3 7 BAD]
155             ]
156              
157             pdl> p $data->fill_rand
158              
159             [
160             [5 2 2 5]
161             [7 3 7 7]
162             ]
163             ',
164             BadDoc => 'The output pdl badflag is cleared.',
165             );
166              
167             pp_def('dev_m',
168             Pars => 'a(n); [o]b(n)',
169             Inplace => 1,
170             GenericTypes => $F,
171             HandleBad => 1,
172             Code => '
173             $GENERIC(b) sa = 0, m = 0;
174             PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
175             loop (n) %{
176             PDL_IF_BAD(if ($ISBAD($a())) continue;,)
177             sa += $a();
178             PDL_IF_BAD(N++;,)
179             %}
180             m = sa / N;
181             loop (n) %{
182             if ($ISBAD($a())) { $SETBAD(b()); continue; }
183             $b() = $a() - m;
184             %}
185             ',
186             Doc => 'Replaces values with deviations from the mean.',
187             );
188              
189             pp_def('stddz',
190             Pars => 'a(n); [o]b(n)',
191             Inplace => 1,
192             GenericTypes => $F,
193             HandleBad => 1,
194             Code => '
195             $GENERIC(b) sa = 0, a2 = 0, m = 0, sd = 0;
196             PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
197             loop (n) %{
198             PDL_IF_BAD(if ($ISBAD($a())) continue;,)
199             sa += $a();
200             a2 += $a() * $a();
201             PDL_IF_BAD(N++;,)
202             %}
203             PDL_IF_BAD(if (N < 1) { loop (n) %{ $SETBAD(b()); %} continue; },)
204             m = sa / N;
205             sd = pow( a2/N - m*m, .5 );
206             loop (n) %{
207             PDL_IF_BAD(if ($ISBAD(a())) { $SETBAD(b()); continue; },)
208             /* sd? does not work, presumably due to floating point */
209             $b() = (sd>0)? (($a() - m) / sd) : 0;
210             %}
211             ',
212             Doc => 'Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0).',
213             );
214              
215             pp_def('sse',
216             Pars => 'a(n); b(n); [o]c()',
217             GenericTypes => $F,
218             HandleBad => 1,
219             Code => '
220             $GENERIC(c) ss = 0;
221             loop (n) %{
222             PDL_IF_BAD(if ( $ISBAD($a()) || $ISBAD($b()) ) continue;,)
223             $GENERIC() diff = $a() - $b();
224             ss += diff * diff;
225             %}
226             $c() = ss;
227             ',
228             Doc => 'Sum of squared errors between actual and predicted values.',
229             );
230              
231             pp_def('mse',
232             Pars => 'a(n); b(n); [o]c()',
233             GenericTypes => $F,
234             HandleBad => 1,
235             Code => '
236             $GENERIC(c) ss = 0;
237             PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
238             loop (n) %{
239             PDL_IF_BAD(if ( $ISBAD($a()) || $ISBAD($b()) ) continue;,)
240             $GENERIC() diff = $a() - $b();
241             ss += diff * diff;
242             PDL_IF_BAD(N++;,)
243             %}
244             if (N < 1) { $SETBAD(c()); continue; }
245             $c() = ss/N;
246             ',
247             Doc => 'Mean of squared errors between actual and predicted values, ie variance around predicted value.',
248             );
249              
250             pp_def('rmse',
251             Pars => 'a(n); b(n); [o]c()',
252             GenericTypes => $F,
253             HandleBad => 1,
254             Code => '
255             $GENERIC(c) d2 = 0;
256             PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
257             loop (n) %{
258             PDL_IF_BAD(if ( $ISBAD($a()) || $ISBAD($b()) ) continue;,)
259             $GENERIC() diff = $a() - $b();
260             d2 += diff * diff;
261             PDL_IF_BAD(N++;,)
262             %}
263             if (N < 1) { $SETBAD(c()); continue; }
264             $c() = sqrt( d2 / N );
265             ',
266             Doc => 'Root mean squared error, ie stdv around predicted value.',
267             );
268              
269             pp_def('pred_logistic',
270             Pars => 'a(n,m); b(m); [o]c(n)',
271             GenericTypes => $F,
272             HandleBad => 1,
273             Code => '
274             loop (n) %{
275             $GENERIC(c) l = 0;
276             PDL_IF_BAD(PDL_Indx bad = 0;,)
277             loop (m) %{
278             PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) { bad = 1; continue; },)
279             l += $a() * $b();
280             %}
281             PDL_IF_BAD(if (bad) { $SETBAD($c()); continue; },)
282             $c() = 1 / ( 1 + exp(-l) );
283             %}
284             ',
285             Doc => '
286             =for ref
287              
288             Calculates predicted prob value for logistic regression.
289              
290             =for usage
291              
292             # glue constant then apply coeff returned by the logistic method
293             $pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} );
294             ',
295             );
296              
297             pp_def('d0',
298             Pars => 'a(n); [o]c()',
299             GenericTypes => $F,
300             HandleBad => 1,
301             Code => '
302             $GENERIC(c) p = 0, ll = 0;
303             PDL_Indx N = PDL_IF_BAD(0,$SIZE(n));
304             loop (n) %{
305             PDL_IF_BAD(if ($ISBAD($a())) continue;,)
306             p += $a();
307             PDL_IF_BAD(N++;,)
308             %}
309             if (N < 1) { $SETBAD(c()); continue; }
310             p /= N;
311             loop (n) %{
312             PDL_IF_BAD(if ($ISBAD($a())) continue;,)
313             ll += $a()? log( p ) : log( 1 - p );
314             %}
315             $c() = -2 * ll;
316             ',
317             Doc => 'Null deviance for logistic regression.',
318             );
319              
320             pp_def('dm',
321             Pars => 'a(n); b(n); [o]c()',
322             GenericTypes => $F,
323             HandleBad => 1,
324             Code => '
325             $GENERIC(c) ll;
326             ll = 0;
327             loop (n) %{
328             PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) continue;,)
329             ll += $a()? log( $b() ) : log( 1 - $b() );
330             %}
331             $c() = -2 * ll;
332             ',
333             Doc => '
334             =for ref
335              
336             Model deviance for logistic regression.
337              
338             =for usage
339              
340             my $dm = $y->dm( $y_pred );
341             # null deviance
342             my $d0 = $y->dm( ones($y->nelem) * $y->avg );
343             ',
344             );
345              
346             pp_def('dvrs',
347             Pars => 'a(); b(); [o]c()',
348             GenericTypes => $F,
349             HandleBad => 1,
350             Code => '
351             PDL_IF_BAD(if ($ISBAD($a()) || $ISBAD($b())) { $SETBAD( $c() ); continue; },)
352             $c() = $a()? sqrt( -2 * log($b()) )
353             : -1 * sqrt( -2 * log(1-$b()) )
354             ;
355             ',
356             Doc => 'Deviance residual for logistic regression.',
357             );
358              
359             pp_addpm pp_line_numbers(__LINE__, <<'EOD');
360              
361             =head2 ols_t
362              
363             =for ref
364              
365             Broadcasted version of ordinary least squares regression (B). The
366             price of broadcasting was losing significance tests for coefficients
367             (but see B). The fitting function was shamelessly copied then
368             modified from PDL::Fit::Linfit. Intercept is FIRST of coeff if CONST => 1.
369              
370             ols_t does not handle bad values. consider B or B
371             if there are bad values.
372              
373             =for options
374              
375             Default options (case insensitive):
376              
377             CONST => 1,
378              
379             =for usage
380              
381             Usage:
382              
383             # DV, 2 person's ratings for top-10 box office movies
384             # ascending sorted by box office numbers
385              
386             pdl> p $y = pdl '1 1 2 4 4 4 4 5 5 5; 1 2 2 2 3 3 3 3 5 5'
387              
388             # model with 2 IVs, a linear and a quadratic trend component
389              
390             pdl> $x = cat sequence(10), sequence(10)**2
391              
392             # suppose our novice modeler thinks this creates 3 different models
393             # for predicting movie ratings
394              
395             pdl> p $x = cat $x, $x * 2, $x * 3
396             [
397             [
398             [ 0 1 2 3 4 5 6 7 8 9]
399             [ 0 1 4 9 16 25 36 49 64 81]
400             ]
401             [
402             [ 0 2 4 6 8 10 12 14 16 18]
403             [ 0 2 8 18 32 50 72 98 128 162]
404             ]
405             [
406             [ 0 3 6 9 12 15 18 21 24 27]
407             [ 0 3 12 27 48 75 108 147 192 243]
408             ]
409             ]
410              
411             pdl> p $x->info
412             PDL: Double D [10,2,3]
413              
414             # insert a dummy dim between IV and the dim (model) to be broadcasted
415              
416             pdl> %m = $y->ols_t( $x->dummy(2) )
417              
418             pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m
419              
420             # 2 persons' ratings, each fitted with 3 "different" models
421             F [
422             [ 38.314159 25.087209]
423             [ 38.314159 25.087209]
424             [ 38.314159 25.087209]
425             ]
426              
427             # df is the same across dv and iv models
428             F_df [2 7]
429             F_p [
430             [0.00016967051 0.00064215074]
431             [0.00016967051 0.00064215074]
432             [0.00016967051 0.00064215074]
433             ]
434              
435             R2 [
436             [ 0.9162963 0.87756762]
437             [ 0.9162963 0.87756762]
438             [ 0.9162963 0.87756762]
439             ]
440              
441             b [ # constant linear quadratic
442             [
443             [ 0.66363636 0.99015152 -0.056818182] # person 1
444             [ 1.4 0.18939394 0.022727273] # person 2
445             ]
446             [
447             [ 0.66363636 0.49507576 -0.028409091]
448             [ 1.4 0.09469697 0.011363636]
449             ]
450             [
451             [ 0.66363636 0.33005051 -0.018939394]
452             [ 1.4 0.063131313 0.0075757576]
453             ]
454             ]
455              
456             # our novice modeler realizes at this point that
457             # the 3 models only differ in the scaling of the IV coefficients
458             ss_model [
459             [ 20.616667 13.075758]
460             [ 20.616667 13.075758]
461             [ 20.616667 13.075758]
462             ]
463              
464             ss_residual [
465             [ 1.8833333 1.8242424]
466             [ 1.8833333 1.8242424]
467             [ 1.8833333 1.8242424]
468             ]
469              
470             ss_total [22.5 14.9]
471             y_pred [
472             [
473             [0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273]
474             ...
475              
476             =cut
477              
478             *ols_t = \&PDL::ols_t;
479             sub PDL::ols_t {
480 117     117 0 247912 _ols_common(1, @_);
481             }
482              
483             sub _ols_common {
484 121     121   409 my ($broadcasted, $y, $ivs, $opt) = @_;
485 121         595 ($y, $ivs) = _ols_prep_inputs(@_);
486 121         442 _ols_main($broadcasted, $y, $ivs, $opt);
487             }
488              
489             sub _ols_prep_inputs {
490             # y [n], ivs [n x attr] pdl
491 121     121   366 my ($broadcasted, $y, $ivs, $opt) = @_;
492 121         729 my %opt = (
493             CONST => 1,
494             PLOT => 0,
495             WIN => undef, # for plotting
496             );
497 121 100       476 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  109         770  
498 121 100       567 if (!$broadcasted) {
499 4         14 $y = $y->squeeze;
500 4 50       215 $y->getndims > 1 and
501             croak "use ols_t for broadcasted version";
502             }
503 121 100       834 $ivs = $ivs->dummy(1) if $ivs->getndims == 1;
504 121 100       2064 ($y, $ivs) = _rm_bad_value( $y, $ivs ) if !$broadcasted;
505             # set up ivs and const as ivs
506             $opt{CONST} and
507 121 100       1073 $ivs = ones($ivs->dim(0))->glue( 1, $ivs );
508 121         3865 ($y, $ivs);
509             }
510              
511             sub _ols_main {
512 141     141   1041 my ($broadcasted, $y, $ivs, $opt) = @_;
513 141         632 my %opt = (
514             CONST => 1,
515             PLOT => 0,
516             WIN => undef, # for plotting
517             );
518 141 100       457 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  129         771  
519 141         773 my $C = inv( $ivs x $ivs->t );
520             # Internally normalise data
521             # (double) it or ushort y and sequence iv won't work right
522 141         9867212 my $ymean = $y->abs->avgover->double;
523 141         15721 $ymean->where( $ymean==0 ) .= 1;
524 141 100       31977 my $divisor = $broadcasted ? $ymean->dummy(0) : $ymean;
525 141         6403 my $y2 = $y / $divisor;
526 141         4122 my $Y = $ivs x $y2->dummy(0);
527             # Do the fit
528             # Fitted coefficients vector
529 141         16410 my $coeff = PDL::squeeze( $C x $Y );
530 141 100 100     16779 $coeff = $coeff->dummy(0)
      100        
531             if $broadcasted and $coeff->getndims == 1 and $y->getndims > 1;
532 141         766 $coeff *= $divisor; # Un-normalise
533             # ***$coeff x $ivs looks nice but produces nan on successive tries***
534 141 100       2876 my $y_pred = sumover( ($broadcasted ? $coeff->dummy(1) : $coeff) * $ivs->transpose );
535 141 50       13833 $opt{PLOT} and $y->plot_residuals( $y_pred, \%opt );
536 141 100       2284 return $coeff unless wantarray;
537 6         22 my %ret = (y_pred => $y_pred);
538 6 100       137 $ret{ss_total} = $opt{CONST} ? $y->ss : sumover( $y ** 2 );
539 6         112 $ret{ss_residual} = $y->sse( $ret{y_pred} );
540 6         21 $ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
541 6         91 $ret{R2} = $ret{ss_model} / $ret{ss_total};
542 6 100       95 my $n_var = $opt{CONST} ? $ivs->dim(1) - 1 : $ivs->dim(1);
543 6         33 $ret{F_df} = pdl( $n_var, my $df1 = $y->dim(0) - $ivs->dim(1) );
544             $ret{F} = $ret{ss_model} / $n_var
545 6         370 / ($ret{ss_residual} / $df1);
546 6 50       282 $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $n_var, $df1 )
547             if $CDF;
548 6 100       16 if (!$broadcasted) {
549 2 50       16 my $se_b = ones( $coeff->dims? $coeff->dims : 1 );
550             $opt{CONST} and
551 2 50       85 $se_b->slice(0) .= sqrt( $ret{ss_residual} / $df1 * $C->slice(0,0) );
552             # get the se for bs by successively regressing each iv by the rest ivs
553 2 50       314 if ($ivs->dim(1) > 1) {
554 2 50       11 my @coords = $opt{CONST} ? 1..$n_var : 0..$n_var-1;
555 2 50       13 my $ones = !$opt{CONST} ? undef : ones($ivs->dim(0));
556 2         73 for my $k (@coords) {
557 2         11 my $G = $ivs->dice_axis(1, [grep $_ != $k, @coords]);
558 2 50       163 $G = $ones->glue( 1, $G ) if $opt{CONST};
559 2         203 my $b_G = $ivs->slice(':',$k)->ols( $G, {CONST=>0,PLOT=>0} );
560 2         14 my $ss_res_k = $ivs->slice(':',$k)->squeeze->sse( sumover($b_G * $G->transpose) );
561 2         225 $se_b->slice($k) .= sqrt( $ret{ss_residual} / $df1 / $ss_res_k );
562             }
563             }
564             else {
565             $se_b->slice(0)
566 0         0 .= sqrt( $ret{ss_residual} / $df1 / sum( $ivs->slice(':',0)**2 ) );
567             }
568 2         155 $ret{b_se} = $se_b;
569 2         6 $ret{b_t} = $coeff / $ret{b_se};
570 2 50       29 $ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $df1 ) )
571             if $CDF;
572             }
573 6 50       26 for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  46         1488  
574 6         196 $ret{b} = $coeff;
575 6         124 return %ret;
576             }
577              
578             =head2 r2_change
579              
580             =for ref
581              
582             Significance test for the incremental change in R2 when new variable(s)
583             are added to an ols regression model.
584              
585             Returns the change stats as well as stats for both models. Based on
586             L. (One way to make up for the lack of significance tests for
587             coeffs in ols_t).
588              
589             =for options
590              
591             Default options (case insensitive):
592              
593             CONST => 1,
594              
595             =for usage
596              
597             Usage:
598              
599             # suppose these are two persons' ratings for top 10 box office movies
600             # ascending sorted by box office
601              
602             pdl> p $y = qsort ceil(random(10, 2) * 5)
603             [
604             [1 1 2 2 2 3 4 4 4 4]
605             [1 2 2 3 3 3 4 4 5 5]
606             ]
607              
608             # first IV is a simple linear trend
609              
610             pdl> p $x1 = sequence 10
611             [0 1 2 3 4 5 6 7 8 9]
612              
613             # the modeler wonders if adding a quadratic trend improves the fit
614              
615             pdl> p $x2 = sequence(10) ** 2
616             [0 1 4 9 16 25 36 49 64 81]
617              
618             # two difference models are given in two pdls
619             # each as would be pass on to ols_t
620             # the 1st model includes only linear trend
621             # the 2nd model includes linear and quadratic trends
622             # when necessary use dummy dim so both models have the same ndims
623              
624             pdl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) )
625              
626             pdl> p "$_\t$c{$_}\n" for sort keys %c
627             # person 1 person 2
628             F_change [0.72164948 0.071283096]
629             # df same for both persons
630             F_df [1 7]
631             F_p [0.42370145 0.79717232]
632             R2_change [0.0085966043 0.00048562549]
633             model0 HASH(0x8c10828)
634             model1 HASH(0x8c135c8)
635              
636             # the answer here is no.
637              
638             =cut
639              
640             *r2_change = \&PDL::r2_change;
641             sub PDL::r2_change {
642 1     1 0 9154 my ($self, $ivs0, $ivs1, $opt) = @_;
643 1 50       12 $ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1);
644              
645 1         48 my %ret;
646              
647 1         6 $ret{model0} = { $self->ols_t( $ivs0, $opt ) };
648 1         5 $ret{model1} = { $self->ols_t( $ivs1, $opt ) };
649              
650 1         5 $ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2};
651             $ret{F_df}
652             = pdl(my $df0 = $ivs1->dim(1) - $ivs0->dim(1),
653 1         25 my $df1 = $ret{model1}->{F_df}->slice('(1)') );
654             $ret{F_change}
655             = $ret{R2_change} * $df1
656 1         110 / ( (1-$ret{model1}->{R2}) * $df0 );
657 1 50       113 $ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $df0, $df1 )
658             if $CDF;
659              
660 1 100       5 for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
  5         90  
661              
662 1         43 %ret;
663             }
664              
665             =head1 METHODS
666              
667             =head2 anova
668              
669             =for ref
670              
671             Analysis of variance. Uses type III sum of squares for unbalanced data.
672              
673             Dependent variable should be a 1D pdl. Independent variables can be
674             passed as 1D perl array ref or 1D pdl.
675              
676             Will only calculate p-value (C) if there are more samples than the
677             product of categories of all the IVs.
678              
679             Supports bad value (by ignoring missing or BAD values in dependent and
680             independent variables list-wise).
681              
682             For more on ANOVA, see L.
683              
684             =for options
685              
686             Default options (case insensitive):
687              
688             V => 1, # carps if bad value in variables
689             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
690             PLOT => 0, # plots highest order effect
691             # can set plot_means options here
692             WIN => undef, # for plotting
693              
694             =for usage
695              
696             Usage:
697              
698             # suppose this is ratings for 12 apples
699              
700             pdl> p $y = qsort ceil( random(12)*5 )
701             [1 1 2 2 2 3 3 4 4 4 5 5]
702              
703             # IV for types of apple
704              
705             pdl> p $a = sequence(12) % 3 + 1
706             [1 2 3 1 2 3 1 2 3 1 2 3]
707              
708             # IV for whether we baked the apple
709              
710             pdl> @b = qw( y y y y y y n n n n n n )
711              
712             pdl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } )
713              
714             pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m
715             F 2.46666666666667
716             F_df [5 6]
717             F_p 0.151168719948632
718             ms_model 3.08333333333333
719             ms_residual 1.25
720             ss_model 15.4166666666667
721             ss_residual 7.5
722             ss_total 22.9166666666667
723             | apple | F 0.466666666666667
724             | apple | F_p 0.648078345471096
725             | apple | df 2
726             | apple | m [2.75 3 3.5]
727             | apple | ms 0.583333333333334
728             | apple | se [0.85391256 0.81649658 0.64549722]
729             | apple | ss 1.16666666666667
730             | apple || err df 6
731             | apple ~ bake | F 0.0666666666666671
732             | apple ~ bake | F_p 0.936190104380701
733             | apple ~ bake | df 2
734             | apple ~ bake | m [
735             [1.5 2 2.5]
736             [ 4 4 4.5]
737             ]
738             | apple ~ bake | ms 0.0833333333333339
739             | apple ~ bake | se [
740             [0.5 1 0.5]
741             [ 1 1 0.5]
742             ]
743             | apple ~ bake | ss 0.166666666666668
744             | apple ~ bake || err df 6
745             | bake | F 11.2666666666667
746             | bake | F_p 0.015294126084452
747             | bake | df 1
748             | bake | m [2 4.1666667]
749             | bake | ms 14.0833333333333
750             | bake | se [0.36514837 0.40138649]
751             | bake | ss 14.0833333333333
752             | bake || err df 6
753              
754             This is implemented as a call to L, with an C
755             subjects vector.
756              
757             =cut
758              
759             *anova = \&PDL::anova;
760             sub PDL::anova {
761 6     6 0 85104 my ($y, @args) = @_;
762 6         30 anova_rptd($y, undef, @args);
763             }
764              
765             sub _interactions {
766 25     25   72 my ($ivs_ref, $idv) = @_;
767 25         53 my (@inter, @idv_inter);
768 25         84 for my $nway ( 2 .. @$ivs_ref ) {
769 28         142 my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
770 28         85 while ( my @v = &$iter_idv() ) {
771 48         153 push @inter, interaction_code(@$ivs_ref[@v]);
772 48         285 push @idv_inter, join ' ~ ', @$idv[@v];
773             }
774             }
775 25         93 (\@inter, \@idv_inter);
776             }
777              
778             # now prepare for cell mean
779             sub _interactions_cm {
780 19     19   47 my ($ivs_ref, $pdl_ivs_raw) = @_;
781 19         93 my ($dim0, @inter_cm, @inter_cmo) = $ivs_ref->[0]->dim(0);
782 19         61 for my $nway ( 2 .. @$ivs_ref ) {
783 21         526 my $iter_idv = _combinations( $nway, [0..$#$ivs_ref] );
784 21         77 while ( my @v = &$iter_idv() ) {
785 37         59 my @i_cm;
786 37         109 for my $o ( 0 .. $dim0 - 1 ) {
787 1714         199463 push @i_cm, join '', map $_->slice("($o)"), @$pdl_ivs_raw[@v];
788             }
789 37         4277 my ($inter, $map) = effect_code( \@i_cm );
790 37         114 push @inter_cm, $inter;
791             # get the order to put means in correct multi dim pdl pos
792             # this is order in var_e dim(1)
793 37         324 my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map;
  719         1229  
794             # this is order needed for cell mean
795 37         179 my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) }
  772         1253  
796             0 .. $#levels;
797 37         197 push @inter_cmo, pdl @i_cmo;
798             }
799             }
800 19         122 (\@inter_cmo, \@inter_cm);
801             }
802              
803             sub _cell_means {
804 19     19   62 my ($data, $ivs_cm_ref, $i_cmo_ref, $idv, $pdl_ivs_raw) = @_;
805 19         36 my %ind_id;
806 19         163 @ind_id{ @$idv } = 0..$#$idv;
807 19         39 my %cm;
808 19         39 my $i = 0;
809 19         56 for (@$ivs_cm_ref) {
810 77 50       246 confess "_cell_means passed empty ivs_cm_ref ndarray at pos $i" if $_->isempty;
811 77         765 my $last = zeroes $_->dim(0);
812 77         1306 my $i_neg = which $_->slice(':',0) == -1;
813 77         12299 $last->slice($i_neg) .= 1;
814 77         6713 $_->where($_ == -1) .= 0;
815 77         15473 $_ = $_->glue(1, $last);
816 77         16092 my @v = split ' ~ ', $idv->[$i];
817 77         436 my @shape = map $pdl_ivs_raw->[$_]->uniq->nelem, @ind_id{@v};
818 77         62322 my ($m, $ss) = $data->centroid( $_ );
819 77         318 $m = $m->slice($i_cmo_ref->[$i])->sever;
820 77         4918 $ss = $ss->slice($i_cmo_ref->[$i])->sever;
821 77         3713 $m = $m->reshape(@shape);
822 77         3839 my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape);
823 77         7466 $cm{ "| $idv->[$i] | m" } = $m;
824 77         248 $cm{ "| $idv->[$i] | se" } = $se;
825 77         484 $i++;
826             }
827 19         114 \%cm;
828             }
829              
830             # http://www.perlmonks.org/?node_id=371228
831             sub _combinations {
832 49     49   137 my ($num, $arr) = @_;
833 0     0   0 return sub { return }
834 49 50 33     304 if $num == 0 or $num > @$arr;
835 49         99 my @pick;
836             return sub {
837 134 100   134   3388 return @$arr[ @pick = ( 0 .. $num - 1 ) ]
838             unless @pick;
839 85         191 my $i = $#pick;
840 85   100     870 $i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i;
841 85 100       698 return if $i < 0;
842 36         129 @pick[$i .. $#pick] = $pick[$i] .. $#$arr;
843 36         145 return @$arr[@pick];
844 49         412 };
845             }
846              
847             =head2 anova_rptd
848              
849             =for ref
850              
851             Repeated measures and mixed model anova. Uses type III sum of squares.
852              
853             The standard error (se) for the means are based on the relevant mean
854             squared error from the anova, ie it is pooled across levels of the effect.
855             Will only calculate p-value (C) if there are more samples than the
856             product of categories of all the IVs.
857              
858             Uses L, so supports bad values.
859              
860             For more on repeated measures ANOVA, see
861             L,
862             and for mixed models, see
863             L.
864              
865             =for options
866              
867             Default options (case insensitive):
868              
869             V => 1, # carps if bad value in dv
870             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
871             BTWN => [], # indices of between-subject IVs (matches IVNM indices)
872             PLOT => 0, # plots highest order effect
873             # see plot_means() for more options
874             WIN => undef, # for plotting
875              
876             =for usage
877              
878             Usage:
879              
880             Some fictional data: recall_w_beer_and_wings.txt
881              
882             Subject Beer Wings Recall
883             Alex 1 1 8
884             Alex 1 2 9
885             Alex 1 3 12
886             Alex 2 1 7
887             Alex 2 2 9
888             Alex 2 3 12
889             Brian 1 1 12
890             Brian 1 2 13
891             Brian 1 3 14
892             Brian 2 1 9
893             Brian 2 2 8
894             Brian 2 3 14
895             ...
896              
897             # rtable allows text only in 1st row and col
898             my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt';
899              
900             my ($b, $w, $dv) = $data->dog;
901             # subj and IVs can be 1d pdl or @ ref
902             # subj must be the first argument
903             my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} );
904              
905             print "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m
906              
907             ss_residual 19.0833333333333
908             ss_subject 24.8333333333333
909             ss_total 133.833333333333
910             | Beer | F 9.39130434782609
911             | Beer | F_p 0.0547977008378944
912             | Beer | df 1
913             | Beer | m [10.916667 8.9166667]
914             | Beer | ms 24
915             | Beer | se [0.4614791 0.4614791]
916             | Beer | ss 24
917             | Beer || err df 3
918             | Beer || err ms 2.55555555555556
919             | Beer || err ss 7.66666666666667
920             | Beer ~ Wings | F 0.510917030567687
921             | Beer ~ Wings | F_p 0.623881438624431
922             | Beer ~ Wings | df 2
923             | Beer ~ Wings | m [
924             [ 10 7]
925             [ 10.5 9.25]
926             [12.25 10.5]
927             ]
928             | Beer ~ Wings | ms 1.625
929             | Beer ~ Wings | se [
930             [0.89170561 0.89170561]
931             [0.89170561 0.89170561]
932             [0.89170561 0.89170561]
933             ]
934             | Beer ~ Wings | ss 3.25000000000001
935             | Beer ~ Wings || err df 6
936             | Beer ~ Wings || err ms 3.18055555555555
937             | Beer ~ Wings || err ss 19.0833333333333
938             | Wings | F 4.52851711026616
939             | Wings | F_p 0.0632754786153548
940             | Wings | df 2
941             | Wings | m [8.5 9.875 11.375]
942             | Wings | ms 16.5416666666667
943             | Wings | se [0.67571978 0.67571978 0.67571978]
944             | Wings | ss 33.0833333333333
945             | Wings || err df 6
946             | Wings || err ms 3.65277777777778
947             | Wings || err ss 21.9166666666667
948              
949             For mixed model anova, ie when there are between-subject IVs
950             involved, feed the IVs as above, but specify in BTWN which IVs are
951             between-subject. For example, if we had added age as a between-subject
952             IV in the above example, we would do
953              
954             my %m = $dv->anova_rptd( $subj, $age, $b, $w,
955             { ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] });
956              
957             =cut
958              
959             *anova_rptd = \&PDL::anova_rptd;
960             sub PDL::anova_rptd {
961 20 100   20 0 174624 my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
962 20         65 my ($y, $subj, @ivs_raw) = @_;
963 20 50       122 confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
964             croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?"
965 20 50 66     110 if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem);
  3         38  
966 20         52 for (@ivs_raw) {
967 43 50 66     1473 croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1
968             }
969 20 100       937 my %opt = (
970             V => 1, # carps if bad value in dv
971             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
972             ( !defined($subj) ? () : (
973             BTWN => [], # indices of between-subject IVs (matches IVNM indices)
974             )),
975             PLOT => 0, # plots highest order effect
976             WIN => undef, # for plotting
977             );
978 20 100       70 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  19         199  
979 3         15 $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
980 20 100 66     95 if !$opt{IVNM} or !@{ $opt{IVNM} };
  20         106  
981 20         95 (my ($dsgn, $idv), $y, my ($sj, $ivs_ref_filtrd, $pdl_ivs_raw, $ivs_ref, $err_ref)) = $y->anova_design_matrix($subj, @ivs_raw, \%opt);
982 20 50       87 confess "anova_rptd: got singular matrix for X' x X"
983             if any +($dsgn->t x $dsgn)->det == 0;
984 20         782552 my $b_full = _ols_main(1, $y, $dsgn->t, {CONST=>0});
985              
986 20         447 my %ret = (ss_total => $y->ss,
987             ss_residual => $y->sse( sumover( $b_full * $dsgn ) ));
988              
989 20 100       892 if (defined $subj) {
990 14         72 my @full = (@$ivs_ref, @$err_ref);
991 14         32 my $has_btwn = @{ $opt{BTWN} };
  14         65  
992 14         36 my @is_btwn; $is_btwn[$_] = 1 for @{ $opt{BTWN} };
  14         26  
  14         68  
993 14         55 my @within_inds = 0 .. $#ivs_raw;
994 14 100       73 @within_inds = grep !$is_btwn[$_], @within_inds if $has_btwn;
995 14         108 my $within_df = pdl(map $_->dim(1), @full[@within_inds])->prodover->sclr;
996 14         1078 EFFECT: for my $k (0 .. $#full) {
997 122 100       3315 my $e = ($k > $#$ivs_ref)? '| err' : '';
998 122 100       355 my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k;
999 122 100       627 my $i_pref = $k == $#full ? undef : "| $idv->[$i] |";
1000              
1001 122 100       458 if (!defined $full[$k]) { # ss_residual as error
    100          
1002 25         97 $ret{ "$i_pref$e ss" } = $ret{ss_residual};
1003             # highest ord inter for purely within design, (p-1)*(q-1)*(n-1)
1004 25 100       137 my $factor = (ref $full[-1] ? $full[-1] : $err_ref->[$full[-1]])->dim(1);
1005 25         93 my $df = $ret{ "$i_pref$e df" } = $factor * $within_df;
1006 25 50       67 die "${i_pref}residual df = 0" if $df <= 0;
1007 25         84 $ret{ "$i_pref$e ms" } = $ret{ "$i_pref$e ss" } / $df;
1008             } elsif (ref $full[$k]) { # unique error term
1009             next EFFECT
1010 80 50 100     2160 unless my @G = grep $_ != $k && defined $full[$_], 0 .. $#full;
1011 80         632 my $G = ones($y->dim(0))->glue(1, grep ref $_, @full[@G]);
1012 80         77399 my $b_G = $y->ols_t( $G, {CONST=>0} );
1013             my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
1014 80 100       592 = $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual};
1015 80 100       6646 if ($k != $#full) {
1016 75         547 my $df = $ret{"$i_pref$e df"} = $full[$k]->dim(1);
1017 75 50       285 die "residual df = 0" if $df <= 0;
1018 75         262 $ret{"$i_pref$e ms"} = $ss / $df;
1019             }
1020             } else { # repeating error term
1021             my $ss = $ret{$k == $#full ? 'ss_subject' : "$i_pref$e ss"}
1022 17 100       110 = $ret{"| $idv->[$full[$k]] |$e ss"};
1023 17 100       92 if ($k != $#full) {
1024 8         63 my $df = $ret{"$i_pref$e df"} = $ret{"| $idv->[$full[$k]] |$e df"};
1025 8 50       21 die "residual df = 0" if $df <= 0;
1026 8         24 $ret{"$i_pref$e ms"} = $ss / $df;
1027             }
1028             }
1029             }
1030             } else {
1031 6         31 $ret{ss_model} = $ret{ss_total} - $ret{ss_residual};
1032 6         132 $ret{F_df} = pdl(my $F_df0 = $dsgn->dim(0) - 1, my $df1 = $y->nelem - $dsgn->dim(0));
1033 6         348 $ret{ms_model} = $ret{ss_model} / $F_df0;
1034 6         117 $ret{ms_residual} = $ret{ss_residual} / $df1;
1035 6         80 $ret{F} = $ret{ms_model} / $ret{ms_residual};
1036 6 50 33     78 $ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $F_df0, $df1 )
1037             if $CDF and $df1 > 0;
1038              
1039             # get IV ss from $ivs_ref instead of $dsgn pdl
1040 6         29 my $ones = ones($y->dim(0));
1041 6         273 for my $k (0 .. $#$ivs_ref) {
1042 24         716 my $G = $ones->glue(1, @$ivs_ref[grep $_ != $k, 0 .. $#$ivs_ref]);
1043 24         14229 my $b_G = $y->ols_t( $G, {CONST=>0} );
1044             $ret{ "| $idv->[$k] | ss" }
1045 24         126 = $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual};
1046 24         1877 my $df0 = $ret{ "| $idv->[$k] | df" } = $ivs_ref->[$k]->dim(1);
1047 24         116 $ret{ "| $idv->[$k] || err df" } = $df1;
1048 24 100       172 die "residual df = 0" if $df1 <= 0;
1049 23         84 $ret{ "| $idv->[$k] | ms" } = $ret{ "| $idv->[$k] | ss" } / $df0;
1050             }
1051             }
1052             # have all iv, inter, and error effects. get F and F_p
1053 19         237 for (0 .. $#$ivs_ref) {
1054 77 100       210 my $ms_residual = defined $subj ? $ret{ "| $idv->[$_] || err ms" } : $ret{ms_residual};
1055 77         297 my ($df0, $df1) = @ret{"| $idv->[$_] | df" , "| $idv->[$_] || err df"};
1056 77         545 my $F = $ret{ "| $idv->[$_] | F" } = $ret{ "| $idv->[$_] | ms" } / $ms_residual;
1057 77 50 33     1137 $ret{ "| $idv->[$_] | F_p" } = 1 - $F->gsl_cdf_fdist_P($df0, $df1)
1058             if $CDF and $df1 > 0;
1059             }
1060              
1061 19 100       157 for (keys %ret) {ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze};
  570         9519  
1062              
1063 19         388 my ($inter_cmo_ref, $inter_cm_ref)
1064             = _interactions_cm($ivs_ref_filtrd, $pdl_ivs_raw);
1065             # append inter info to cell means effects
1066 19         72 my $ivs_cm_ref = [@$ivs_ref_filtrd, @$inter_cm_ref];
1067 19         59 my @i_cmo_ref = map pdl(values %{ (effect_code($_))[1] })->qsort, @$pdl_ivs_raw;
  40         3389  
1068 19         1280 push @i_cmo_ref, @$inter_cmo_ref;
1069              
1070 19         112 my $cm_ref
1071             = _cell_means( $y, $ivs_cm_ref, \@i_cmo_ref, $idv, $pdl_ivs_raw );
1072 19 100       103 if (defined $subj) {
1073 14         45 my @ls = map { $_->uniq->nelem } @$pdl_ivs_raw;
  29         7156  
1074             $cm_ref
1075 14         6258 = _fix_rptd_se( $cm_ref, \%ret, $opt{IVNM}, \@ls, $sj->uniq->nelem );
1076             }
1077             # integrate mean and se into %ret
1078 19         248 @ret{ keys %$cm_ref } = values %$cm_ref;
1079              
1080 19         50 my $highest = join(' ~ ', @{ $opt{IVNM} });
  19         190  
1081             $cm_ref->{"| $highest | m"}->plot_means( $cm_ref->{"| $highest | se"},
1082             { %opt, IVNM=>$idv } )
1083 19 50       86 if $opt{PLOT};
1084              
1085 19         2174 %ret;
1086             }
1087              
1088             =head2 anova_design_matrix
1089              
1090             =for ref
1091              
1092             Effect-coded design matrix for anova, including repeated-measures and
1093             mixed-model. The C for use in linear regression i.e. C.
1094              
1095             Added in 0.854. See L for more.
1096              
1097             Supports bad value in the dependent and independent variables. It
1098             automatically removes bad data listwise, i.e. remove a subject's data if
1099             there is any cell missing for the subject.
1100              
1101             =for options
1102              
1103             Default options (case insensitive):
1104              
1105             V => 1, # carps if bad value in dv
1106             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1107             BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1108              
1109             =for usage
1110              
1111             $matrix = $dv->anova_design_matrix(undef, $b, $w, {ivnm=>[qw(b w)]});
1112             $matrix = $dv->anova_design_matrix(
1113             $subj, $b, $w, {ivnm=>[qw(b w)]}); # repeated-measures
1114             $matrix = $dv->anova_design_matrix(
1115             $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # mixed-model
1116             ($matrix, $ivnm_ref) = $dv->anova_design_matrix(
1117             $subj, $b, $w, {ivnm=>[qw(b w)], btwn=>['b']}); # list context also names
1118              
1119             =cut
1120              
1121             *anova_design_matrix = \&PDL::anova_design_matrix;
1122             sub PDL::anova_design_matrix {
1123 25 50   25 0 42871 my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
1124 25         90 my ($y, $subj, @ivs_raw) = @_;
1125 25 50       74 confess "No IVs: did you omit 'undef' for anova?" if !@ivs_raw;
1126 25 50       127 confess "Expected 1-D data, instead: ", $y->info if $y->ndims != 1;
1127 25         69 for (@ivs_raw) {
1128 53 50 66     1254 croak "too many dims in IV!" if ref $_ eq 'PDL' and $_->squeeze->ndims > 1;
1129             }
1130 25 100       940 my %opt = (
1131             V => 1, # carps if bad value in dv
1132             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
1133             ( !defined($subj) ? () : (
1134             BTWN => [], # indices of between-subject IVs (matches IVNM indices)
1135             )),
1136             );
1137 25 50       93 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
  25         938  
1138 3         13 $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
1139 25 100 66     99 if !$opt{IVNM} or !@{ $opt{IVNM} };
  25         126  
1140 25         45 my @idv_orig = @{ $opt{IVNM} };
  25         77  
1141 25         212 my @pdl_ivs_raw = map scalar PDL::Stats::Basic::code_ivs($_), @ivs_raw;
1142 25         87 my $pdl_ivs = pdl(\@pdl_ivs_raw);
1143             # explicit set badflag because pdl() removes badflag
1144 25         3686 $pdl_ivs->badflag( scalar grep $_->badflag, @pdl_ivs_raw );
1145 25         52 my $sj;
1146 25 100       90 if (defined($subj)) {
1147             # delete bad data listwise ie remove subj if any cell missing
1148 18         64 $sj = PDL::Stats::Basic::code_ivs($subj);
1149 18         336 my $ibad = which( $y->isbad | nbadover($pdl_ivs->transpose) );
1150 18         3038 my $sj_bad = $sj->slice($ibad)->uniq;
1151 18 100       1028 if ($sj_bad->nelem) {
1152             warn $sj_bad->nelem . " subjects with missing data removed\n"
1153 4 50       18 if $opt{V};
1154             $sj = $sj->setvaltobad($_)
1155 4         14 for (list $sj_bad);
1156 4         134 my $igood = which $sj->isgood;
1157 4         308 for ($y, $sj, @pdl_ivs_raw) {
1158 16         38 $_ = $_->slice( $igood )->sever;
1159 16         618 $_->badflag(0);
1160             }
1161             }
1162             } else {
1163 7         36 ($y, $pdl_ivs) = _rm_bad_value( $y, $pdl_ivs );
1164 7 50 66     288 if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) {
1165 0         0 warn sprintf "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem;
1166             }
1167 7         116 @pdl_ivs_raw = $pdl_ivs->dog;
1168             }
1169 25         742 my @ivs_ref_fltrd = map scalar effect_code($_), @pdl_ivs_raw;
1170 25         140 my ($ivs_inter_ref, $idv_inter) = _interactions(\@ivs_ref_fltrd, \@idv_orig);
1171             # append inter info to main effects
1172 25         67 my $ivs_ref = [@ivs_ref_fltrd, @$ivs_inter_ref];
1173 25         81 my @idv = (@idv_orig, @$idv_inter);
1174             # matches $ivs_ref, with an extra last pdl for subj effect
1175             my $err_ref = !defined($subj) ? [] :
1176 25 100       156 _add_errors( $sj, $ivs_ref, \@idv, \@pdl_ivs_raw, $opt{BTWN} );
1177 25         151 for (grep ref $err_ref->[$_], 0..$#$err_ref) {
1178 32         950 my ($null_row_ids, $non_null_row_ids) = $err_ref->[$_]->zcover->which_both;
1179 32 50       5447 confess "got null columns $null_row_ids in error entry #$_ ($idv[$_])"
1180             if !$null_row_ids->isempty;
1181             }
1182 25         292 my $dsgn = PDL::glue(1, ones($y->dim(0)), @$ivs_ref, (grep ref($_), @$err_ref))->t;
1183 25 100       22324 !wantarray ? $dsgn : ($dsgn, \@idv, $y, $sj, \@ivs_ref_fltrd, \@pdl_ivs_raw, $ivs_ref, $err_ref);
1184             }
1185              
1186             # code (btwn group) subjects. Rutherford (2011) pp 208-209
1187             sub _code_btwn {
1188 18     18   38 my ($subj, $btwn) = @_;
1189 18         46 my (@grp, %grp_s);
1190 18         118 for my $n (0 .. $subj->nelem - 1) {
1191             # construct string to code group membership
1192             # something not treated as BAD by code_ivs to start off marking group membership
1193             # if no $btwn, everyone ends up in the same grp
1194 510         1195 my $s = '_' . join '', map $_->slice($n), @$btwn;
1195 510         22422 push @grp, $s; # group membership
1196 510         1052 $s .= $subj->slice($n); # keep track of total uniq subj
1197 510         27952 $grp_s{$s} = 1;
1198             }
1199 18         121 my $grp = PDL::Stats::Kmeans::iv_cluster \@grp;
1200 18         147 my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1);
1201 18         524 my $d1 = 0;
1202 18         100 for my $g (0 .. $grp->dim(1)-1) {
1203 33         139 my $col_inds = which $grp->slice(':',$g);
1204 33         4559 my $gsub = $subj->slice( $col_inds )->effect_code;
1205 33         314 my ($nobs, $nsub) = $gsub->dims;
1206 33         144 $spdl->slice($col_inds, [$d1,$d1+$nsub-1]) .= $gsub;
1207 33         2618 $d1 += $nsub;
1208             }
1209 18         215 $spdl;
1210             }
1211              
1212             sub _add_errors {
1213 18     18   54 my ($subj, $ivs_ref, $idv, $raw_ivs, $btwn) = @_;
1214 18         87 my $spdl = _code_btwn($subj, [@$raw_ivs[@$btwn]]);
1215             # if btwn factor involved, or highest order inter for within factors
1216             # elem is undef, so that
1217             # @errors ind matches @$ivs_ref, with an extra elem at the end for subj
1218             # mark btwn factors for error terms
1219             # same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p205)
1220 18         117 my %is_btwn = map +($_=>1), @$idv[ @$btwn ];
1221 18         46 my $has_btwn = keys %is_btwn;
1222 18         155 my %idv2indx = map +($idv->[$_]=>$_), 0..$#$idv;
1223 18         42 my $ie_subj;
1224             my @errors = map {
1225 18         46 my @fs = split ' ~ ', $idv->[$_];
  64         189  
1226             # separate bw and wn factors
1227             # if only bw, error is bw x subj
1228             # if only wn or wn and bw, error is wn x subj
1229 64 100       195 my @bw = !$has_btwn ? () : grep $is_btwn{$_}, @fs;
1230 64 100       185 my @wn = !$has_btwn ? @fs : grep !$is_btwn{$_}, @fs;
1231 64 100 100     250 $ie_subj = $_ if !defined($ie_subj) and !@wn;
1232 64 100       161 my $err = join ' ~ ', @wn ? @wn : @bw;
1233             # highest order inter of within factors, use ss_residual as error
1234 64 100 100     279 if ( @wn == @$raw_ivs - @$btwn ) { undef }
  31 100       110  
    100          
    100          
1235             # repeating btwn factors use ss_subject as error
1236 2         5 elsif (!@wn and $_ > $ie_subj) { $ie_subj }
1237             # repeating error term
1238 6         13 elsif ($_ > $idv2indx{$err}) { $idv2indx{$err} }
1239 14         54 elsif (@wn) { interaction_code($ivs_ref->[$_], $spdl) }
1240 11         31 else { $spdl }
1241             } 0 .. $#$ivs_ref;
1242 18 100       70 push @errors, $has_btwn ? $ie_subj : $spdl;
1243 18         98 \@errors;
1244             }
1245              
1246             sub _fix_rptd_se {
1247             # if ivnm lvls_ref for within ss only this can work for mixed design
1248 14     14   6291 my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_;
1249 14 100       358 my @se = map /^\| (.+?) \| se$/ ? $1 : (), keys %$cm_ref;
1250             my @n_obs = map {
1251 14         44 my @ivs = split / ~ /, $_;
  54         1082  
1252 54         255 my $i_ivs = which_id $ivnm, \@ivs;
1253 54         2789 my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs;
1254 54 100       68631 my $collapsed = $icollapsed->nelem?
1255             pdl( @$lvls_ref[(list $icollapsed)] )->prodover
1256             : 1
1257             ;
1258 54         3126 $n * $collapsed;
1259             } @se;
1260 14         265 for my $i (0 .. $#se) {
1261             $cm_ref->{"| $se[$i] | se"}
1262 54         1473 .= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] );
1263             }
1264 14         529 $cm_ref;
1265             }
1266              
1267             =head2 dummy_code
1268              
1269             =for ref
1270              
1271             Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression.
1272              
1273             Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1274              
1275             =for usage
1276              
1277             pdl> @a = qw(a a a b b b c c c)
1278             pdl> p $a = dummy_code(\@a)
1279             [
1280             [1 1 1 0 0 0 0 0 0]
1281             [0 0 0 1 1 1 0 0 0]
1282             ]
1283              
1284             =cut
1285              
1286             *dummy_code = \&PDL::dummy_code;
1287             sub PDL::dummy_code {
1288 0     0 0 0 my ($var_ref) = @_;
1289 0         0 my $var_e = effect_code( $var_ref );
1290 0         0 $var_e->where( $var_e == -1 ) .= 0;
1291 0         0 $var_e;
1292             }
1293              
1294             =head2 effect_code
1295              
1296             =for ref
1297              
1298             Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for
1299             use in regression. returns in @ context coded pdl and % ref to level -
1300             pdl->dim(1) index.
1301              
1302             Supports BAD value (missing or 'BAD' values result in the corresponding
1303             pdl elements being marked as BAD).
1304              
1305             =for usage
1306              
1307             my @var = qw( a a a b b b c c c );
1308             my ($var_e, $map) = effect_code( \@var );
1309              
1310             print $var_e . $var_e->info . "\n";
1311              
1312             [
1313             [ 1 1 1 0 0 0 -1 -1 -1]
1314             [ 0 0 0 1 1 1 -1 -1 -1]
1315             ]
1316             PDL: Double D [9,2]
1317              
1318             print "$_\t$map->{$_}\n" for sort keys %$map
1319             a 0
1320             b 1
1321             c 2
1322              
1323             =cut
1324              
1325             *effect_code = \&PDL::effect_code;
1326             sub PDL::effect_code {
1327 171     171 0 27024 my ($var_ref) = @_;
1328 171         753 my ($var, $map_ref) = PDL::Stats::Basic::code_ivs( $var_ref );
1329 171         728 my $var_max = $var->max;
1330 171 100       3322 confess "effect_code called with only one unique value" if $var_max < 1;
1331 169         13285 my $var_e = yvals( float, $var->nelem, $var_max ) == $var;
1332 169         18236 $var_e->slice(which( $var == $var_max ), ) .= -1;
1333 169 100       42721 $var_e = $var_e->setbadif( $var->isbad ) if $var->badflag;
1334 169 100       2706 wantarray ? ($var_e, $map_ref) : $var_e;
1335             }
1336              
1337             =head2 effect_code_w
1338              
1339             =for ref
1340              
1341             Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index.
1342              
1343             Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD).
1344              
1345             =for usage
1346              
1347             pdl> @a = qw( a a b b b c c )
1348             pdl> p $a = effect_code_w(\@a)
1349             [
1350             [ 1 1 0 0 0 -1 -1]
1351             [ 0 0 1 1 1 -1.5 -1.5]
1352             ]
1353              
1354             =cut
1355              
1356             *effect_code_w = \&PDL::effect_code_w;
1357             sub PDL::effect_code_w {
1358 2     2 0 3723 my ($var_ref) = @_;
1359 2         7 my ($var_e, $map_ref) = effect_code( $var_ref );
1360 1 0       8 return wantarray ? ($var_e, $map_ref) : $var_e if $var_e->sum == 0;
    50          
1361 1         143 my $pos = $var_e == 1;
1362 1         23 my $neg = $var_e == -1;
1363 1         30 my $w = $pos->sumover / $neg->sumover;
1364 1         26 my $neg_ind = $neg->whichND;
1365 1         552 $var_e->indexND($neg_ind) *= $w->slice($neg_ind->slice('(1)'));
1366 1 50       386 wantarray ? ($var_e, $map_ref) : $var_e;
1367             }
1368              
1369             =head2 interaction_code
1370              
1371             Returns the coded interaction term for effect-coded variables.
1372              
1373             Supports BAD value (missing or 'BAD' values result in the corresponding
1374             pdl elements being marked as BAD).
1375              
1376             =for usage
1377              
1378             pdl> $a = sequence(6) > 2
1379             pdl> p $a = $a->effect_code
1380             [
1381             [ 1 1 1 -1 -1 -1]
1382             ]
1383              
1384             pdl> $b = pdl( qw( 0 1 2 0 1 2 ) )
1385             pdl> p $b = $b->effect_code
1386             [
1387             [ 1 0 -1 1 0 -1]
1388             [ 0 1 -1 0 1 -1]
1389             ]
1390              
1391             pdl> p $ab = interaction_code( $a, $b )
1392             [
1393             [ 1 0 -1 -1 -0 1]
1394             [ 0 1 -1 -0 -1 1]
1395             ]
1396              
1397             =cut
1398              
1399             *interaction_code = \&PDL::interaction_code;
1400             sub PDL::interaction_code {
1401 72     72 0 1177 my $i = ones( $_[0]->dim(0), 1 );
1402 72         3832 $i = ($i * $_->dummy(1))->clump(1,2) for @_;
1403 72         22883 $i;
1404             }
1405              
1406             =head2 ols
1407              
1408             =for ref
1409              
1410             Ordinary least squares regression, aka linear regression.
1411              
1412             Unlike B, ols is not broadcastable, but it can handle bad value (by
1413             ignoring observations with bad value in dependent or independent variables
1414             list-wise) and returns the full model in list context with various stats.
1415              
1416             IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
1417             the constant vector in $x. Intercept is automatically added and returned
1418             as FIRST of the coeffs if CONST=>1. Returns full model in list context
1419             and coeff in scalar context.
1420              
1421             For more on multiple linear regression see
1422             L.
1423              
1424             =for options
1425              
1426             Default options (case insensitive):
1427              
1428             CONST => 1,
1429             PLOT => 0, # see plot_residuals() for plot options
1430             WIN => undef, # for plotting
1431              
1432             =for usage
1433              
1434             Usage:
1435              
1436             # suppose this is a person's ratings for top 10 box office movies
1437             # ascending sorted by box office
1438              
1439             pdl> $y = pdl '[1 1 2 2 2 2 4 4 5 5]'
1440              
1441             # construct IV with linear and quadratic component
1442              
1443             pdl> p $x = cat sequence(10), sequence(10)**2
1444             [
1445             [ 0 1 2 3 4 5 6 7 8 9]
1446             [ 0 1 4 9 16 25 36 49 64 81]
1447             ]
1448              
1449             pdl> %m = $y->ols( $x )
1450              
1451             pdl> p "$_\t@{[$m{$_} =~ /^\n*(.*?)\n*\z/s]}\n" for sort keys %m
1452              
1453             F 40.4225352112676
1454             F_df [2 7]
1455             F_p 0.000142834216344756
1456             R2 0.920314253647587
1457              
1458             # coeff constant linear quadratic
1459             b [0.981818 0.212121 0.030303]
1460             b_p [0.039910 0.328001 0.203034]
1461             b_se [0.389875 0.201746 0.021579]
1462             b_t [2.518284 1.051422 1.404218]
1463             ss_model 19.8787878787879
1464             ss_residual 1.72121212121212
1465             ss_total 21.6
1466             y_pred [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545]
1467              
1468             =cut
1469              
1470             *ols = \&PDL::ols;
1471             sub PDL::ols {
1472 4     4 0 8761 _ols_common(0, @_);
1473             }
1474              
1475             # ivs = [nobs x nivs] so can `dog` retval
1476             sub _rm_bad_value {
1477 11     11   30 my ($y, $ivs) = @_;
1478 11 50 66     41 return ($y, $ivs, undef) if !$y->check_badflag and !$ivs->check_badflag;
1479 3         1018 my $idx = which($y->isgood & (nbadover ($ivs->transpose)==0));
1480 3         596 $_ = $_->slice($idx)->sever for $y, $ivs;
1481 3         343 $_->badflag(0) for $y, $ivs;
1482 3         17 ($y, $ivs, $idx);
1483             }
1484              
1485             =head2 ols_rptd
1486              
1487             =for ref
1488              
1489             Repeated measures linear regression.
1490             Handles purely within-subject design for now.
1491              
1492             (Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006).
1493             See F for an example using the Lorch and Myers data.
1494              
1495             =for usage
1496              
1497             Usage:
1498              
1499             # This is the example from Lorch and Myers (1990),
1500             # a study on how characteristics of sentences affected reading time
1501             # Three within-subject IVs:
1502             # SP -- serial position of sentence
1503             # WORDS -- number of words in sentence
1504             # NEW -- number of new arguments in sentence
1505              
1506             # $subj can be 1D pdl or @ ref and must be the first argument
1507             # IV can be 1D @ ref or pdl
1508             # 1D @ ref is effect coded internally into pdl
1509             # pdl is left as is
1510              
1511             my %r = $rt->ols_rptd( $subj, $sp, $words, $new );
1512              
1513             print "$_\t$r{$_}\n" for sort keys %r;
1514              
1515             ss_residual 58.3754646504336
1516             ss_subject 51.8590337714286
1517             ss_total 405.188241771429
1518             # SP WORDS NEW
1519             F [ 7.208473 61.354153 1.0243311]
1520             F_p [0.025006181 2.619081e-05 0.33792837]
1521             coeff [0.33337285 0.45858933 0.15162986]
1522             df [1 1 1]
1523             df_err [9 9 9]
1524             ms [ 18.450705 73.813294 0.57026483]
1525             ms_err [ 2.5595857 1.2030692 0.55671923]
1526             ss [ 18.450705 73.813294 0.57026483]
1527             ss_err [ 23.036272 10.827623 5.0104731]
1528              
1529             =cut
1530              
1531             *ols_rptd = \&PDL::ols_rptd;
1532             sub PDL::ols_rptd {
1533 1     1 0 131 my ($y, $subj, @ivs_raw) = @_;
1534              
1535 1         5 $y = $y->squeeze;
1536 1 50       46 $y->getndims > 1 and
1537             croak "ols_rptd does not support broadcasting";
1538              
1539 1 50 33     3 my @ivs = map { (ref $_ eq 'PDL' and $_->ndims > 1)? $_
  3 50       127  
1540             : ref $_ eq 'PDL' ? $_->dummy(1)
1541             : scalar effect_code($_)
1542             ;
1543             } @ivs_raw;
1544              
1545 1         36 my %r;
1546              
1547 1         16 $r{ss_total} = $y->ss;
1548              
1549             # STEP 1: subj
1550 1         6 my $s = effect_code $subj;
1551 1         5 my $b_s = $y->ols_t($s);
1552 1         5 my $pred = sumover($b_s->slice('1:-1') * $s->transpose) + $b_s->slice(0);
1553 1         104 $r{ss_subject} = $r{ss_total} - $y->sse( $pred );
1554              
1555             # STEP 2: add predictor variables
1556 1         21 my $iv_p = $s->glue(1, @ivs);
1557 1         412 my $b_p = $y->ols_t($iv_p);
1558              
1559             # only care about coeff for predictor vars. no subj or const coeff
1560 1         17 $r{coeff} = $b_p->slice([-@ivs,-1])->sever;
1561              
1562             # get total sse for this step
1563 1         41 $pred = sumover($b_p->slice('1:-1') * $iv_p->transpose) + $b_p->slice(0);
1564 1         128 my $ss_pe = $y->sse( $pred );
1565              
1566             # get predictor ss by successively reducing the model
1567 1         8 $r{ss} = zeroes scalar(@ivs);
1568 1         19 for my $i (0 .. $#ivs) {
1569 3         219 my $iv = $s->glue(1, @ivs[ grep $_ != $i, 0..$#ivs ]);
1570 3         987 my $b = $y->ols_t($iv);
1571 3         22 $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
1572 3         678 $r{ss}->slice($i) .= $y->sse($pred) - $ss_pe;
1573             }
1574              
1575             # STEP 3: get predictor x subj interaction as error term
1576 1         79 my $iv_e = PDL::glue 1, map interaction_code( $s, $_ ), @ivs;
1577              
1578             # get total sse for this step. full model now.
1579 1         431 my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) );
1580 1         6 $pred = sumover($b_f->slice('1:-1') * $iv_p->glue(1,$iv_e)->transpose) + $b_f->slice(0);
1581 1         204 $r{ss_residual} = $y->sse( $pred );
1582              
1583             # get predictor x subj ss by successively reducing the error term
1584 1         6 $r{ss_err} = zeroes scalar(@ivs);
1585 1         17 for my $i (0 .. $#ivs) {
1586 3         167 my $iv = $iv_p->glue(1, map interaction_code($s, $_),
1587             @ivs[grep $_ != $i, 0..$#ivs]);
1588 3         926 my $b = $y->ols_t($iv);
1589 3         14 my $pred = sumover($b->slice('1:-1') * $iv->transpose) + $b->slice(0);
1590 3         335 $r{ss_err}->slice($i) .= $y->sse($pred) - $r{ss_residual};
1591             }
1592              
1593             # Finally, get MS, F, etc
1594              
1595 1         78 $r{df} = pdl( map $_->squeeze->ndims, @ivs );
1596 1         251 $r{ms} = $r{ss} / $r{df};
1597              
1598 1         25 $r{df_err} = $s->dim(1) * $r{df};
1599 1         24 $r{ms_err} = $r{ss_err} / $r{df_err};
1600              
1601 1         15 $r{F} = $r{ms} / $r{ms_err};
1602              
1603             $r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} )
1604 1 50       18 if $CDF;
1605              
1606 1         53 %r;
1607             }
1608              
1609              
1610             =head2 logistic
1611              
1612             =for ref
1613              
1614             Logistic regression with maximum likelihood estimation using L.
1615              
1616             IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply
1617             the constant vector in $x. It is included in the model and returned as
1618             LAST of coeff. Returns full model in list context and coeff in scalar
1619             context.
1620              
1621             The significance tests are likelihood ratio tests (-2LL deviance)
1622             tests. IV significance is tested by comparing deviances between the
1623             reduced model (ie with the IV in question removed) and the full model.
1624              
1625             ***NOTE: the results here are qualitatively similar to but not identical
1626             with results from R, because different algorithms are used for the
1627             nonlinear parameter fit. Use with discretion***
1628              
1629             =for options
1630              
1631             Default options (case insensitive):
1632              
1633             INITP => zeroes( $x->dim(1) + 1 ), # n_iv + 1
1634             MAXIT => 1000,
1635             EPS => 1e-7,
1636              
1637             =for usage
1638              
1639             Usage:
1640              
1641             # suppose this is whether a person had rented 10 movies
1642              
1643             pdl> p $y = ushort( random(10)*2 )
1644             [0 0 0 1 1 0 0 1 1 1]
1645              
1646             # IV 1 is box office ranking
1647              
1648             pdl> p $x1 = sequence(10)
1649             [0 1 2 3 4 5 6 7 8 9]
1650              
1651             # IV 2 is whether the movie is action- or chick-flick
1652              
1653             pdl> p $x2 = sequence(10) % 2
1654             [0 1 0 1 0 1 0 1 0 1]
1655              
1656             # concatenate the IVs together
1657              
1658             pdl> p $x = cat $x1, $x2
1659             [
1660             [0 1 2 3 4 5 6 7 8 9]
1661             [0 1 0 1 0 1 0 1 0 1]
1662             ]
1663              
1664             pdl> %m = $y->logistic( $x )
1665              
1666             pdl> p "$_\t$m{$_}\n" for sort keys %m
1667              
1668             D0 13.8629436111989
1669             Dm 9.8627829791575
1670             Dm_chisq 4.00016063204141
1671             Dm_df 2
1672             Dm_p 0.135324414081692
1673             # ranking genre constant
1674             b [0.41127706 0.53876358 -2.1201285]
1675             b_chisq [ 3.5974504 0.16835559 2.8577151]
1676             b_p [0.057868258 0.6815774 0.090936587]
1677             iter 12
1678             y_pred [0.10715577 0.23683909 ... 0.76316091 0.89284423]
1679              
1680             # to get the covariance out, supply a true value for the COV option:
1681             pdl> %m = $y->logistic( $x, {COV=>1} )
1682             pdl> p $m{cov};
1683              
1684             =cut
1685              
1686             *logistic = \&PDL::logistic;
1687 0     0 0 0 sub PDL::logistic {
1688 0         0 require PDL::Fit::LM;
1689 0         0 my ( $self, $ivs, $opt ) = @_;
1690             $self = $self->squeeze;
1691 0 0       0 # make compatible w multiple var cases
1692 0 0       0 $ivs->getndims == 1 and $ivs = $ivs->dummy(1);
1693             $self->dim(0) != $ivs->dim(0) and
1694 0         0 carp "mismatched n btwn DV and IV!";
1695             my %opt = (
1696             INITP => zeroes( $ivs->dim(1) + 1 ), # n_ivs + 1
1697             MAXIT => 1000,
1698             EPS => 1e-7,
1699 0 0       0 );
  0         0  
1700             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1701 0         0 # not using it atm
1702             $opt{WT} = 1;
1703             # Use lmfit. Fourth input argument is reference to user-defined
1704             # copy INITP so we have the original value when needed
1705             my ($yfit,$coeff,$cov,$iter)
1706 0         0 = PDL::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy,
1707             { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1708             # apparently at least coeff is child of some pdl
1709 0         0 # which is changed in later lmfit calls
1710 0         0 $yfit = $yfit->copy;
1711 0 0       0 $coeff = $coeff->copy;
1712 0         0 return $coeff unless wantarray;
1713 0         0 my %ret;
1714 0         0 my $n0 = $self->where($self == 0)->nelem;
1715 0 0       0 my $n1 = $self->nelem - $n0;
1716 0         0 $ret{cov} = $cov if $opt{COV};
1717 0         0 $ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem));
1718 0         0 $ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 );
1719 0         0 $ret{Dm_chisq} = $ret{D0} - $ret{Dm};
1720             $ret{Dm_df} = $ivs->dim(1);
1721             $ret{Dm_p}
1722 0 0       0 = 1 - PDL::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} )
1723 0         0 if $CDF;
1724 0 0       0 my $coeff_chisq = zeroes $opt{INITP}->nelem;
1725 0         0 if ( $ivs->dim(1) > 1 ) {
1726 0         0 for my $k (0 .. $ivs->dim(1)-1) {
  0         0  
1727 0         0 my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1);
1728 0         0 my $G = $ivs->dice_axis(1, \@G);
1729             my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy;
1730             my $y_G
1731 0         0 = PDL::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init,
1732 0         0 { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1733             $coeff_chisq->slice($k) .= $self->dm( $y_G ) - $ret{Dm};
1734             }
1735             }
1736             else {
1737 0         0 # d0 is, by definition, the deviance with only intercept
1738             $coeff_chisq->slice(0) .= $ret{D0} - $ret{Dm};
1739             }
1740             my $y_c
1741 0         0 = PDL::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->slice('0:-2')->sever,
1742 0         0 { Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } );
1743 0         0 $coeff_chisq->slice(-1) .= $self->dm( $y_c ) - $ret{Dm};
1744 0         0 $ret{b} = $coeff;
1745 0 0       0 $ret{b_chisq} = $coeff_chisq;
1746             $ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 )
1747 0         0 if $CDF;
1748 0         0 $ret{y_pred} = $yfit;
1749 0 0       0 $ret{iter} = $iter;
  0         0  
1750 0         0 for (keys %ret) { ref $ret{$_} eq 'PDL' and $ret{$_} = $ret{$_}->squeeze };
1751             %ret;
1752             }
1753              
1754 0     0   0 sub _logistic {
1755             my ($x,$par,$ym,$dyda) = @_;
1756 0         0 # $b and $c are fit parameters slope and intercept
1757 0         0 my $b = $par->slice([0,$x->dim(1) - 1])->sever;
1758             my $c = $par->slice(-1)->sever;
1759             # Write function with dependent variable $ym,
1760             # independent variable $x, and fit parameters as specified above.
1761             # Use the .= (dot equals) assignment operator to express the equality
1762 0         0 # (not just a plain equals)
1763 0         0 $ym .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) );
1764             my @dy = map $dyda->slice(",($_)"), 0 .. $par->dim(0)-1;
1765             # Partial derivative of the function with respect to each slope
1766             # fit parameter ($b in this case). Again, note .= assignment
1767             # operator (not just "equals")
1768 0         0 $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
1769             for (0 .. $b->dim(0)-1);
1770 0         0 # Partial derivative of the function re intercept par
1771             $dy[-1] .= $ym * (1 - $ym);
1772             }
1773              
1774 0     0   0 sub _logistic_no_intercept {
1775 0         0 my ($x,$par,$ym,$dyda) = @_;
1776             my $b = $par->slice([0,$x->dim(1) - 1])->sever;
1777             # Write function with dependent variable $ym,
1778             # independent variable $x, and fit parameters as specified above.
1779             # Use the .= (dot equals) assignment operator to express the equality
1780 0         0 # (not just a plain equals)
1781 0         0 $ym .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) );
  0         0  
1782             my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1);
1783             # Partial derivative of the function with respect to each slope
1784             # fit parameter ($b in this case). Again, note .= assignment
1785             # operator (not just "equals")
1786 0         0 $dy[$_] .= $x->slice(':',$_) * $ym * (1 - $ym)
1787             for 0 .. $b->dim(0)-1;
1788             }
1789              
1790             =head2 pca
1791              
1792             =for ref
1793              
1794             Principal component analysis. Based on corr instead of cov.
1795              
1796             Bad values are ignored pair-wise. OK when bad values are few but otherwise
1797             probably should fill_m etc before pca). Uses L.
1798              
1799             =for options
1800              
1801             Default options (case insensitive):
1802              
1803             CORR => 1, # boolean. use correlation or covariance
1804             PLOT => 0, # calls plot_screes by default
1805             # can set plot_screes options here
1806             WIN => undef, # for plotting
1807              
1808             =for usage
1809              
1810             Usage:
1811              
1812             my $d = qsort random 10, 5; # 10 obs on 5 variables
1813             my %r = $d->pca( \%opt );
1814             print "$_\t$r{$_}\n" for (keys %r);
1815              
1816             eigenvalue # variance accounted for by each component
1817             [4.70192 0.199604 0.0471421 0.0372981 0.0140346]
1818              
1819             eigenvector # dim var x comp. weights for mapping variables to component
1820             [
1821             [ -0.451251 -0.440696 -0.457628 -0.451491 -0.434618]
1822             [ -0.274551 0.582455 0.131494 0.255261 -0.709168]
1823             [ 0.43282 0.500662 -0.139209 -0.735144 -0.0467834]
1824             [ 0.693634 -0.428171 0.125114 0.128145 -0.550879]
1825             [ 0.229202 0.180393 -0.859217 0.4173 0.0503155]
1826             ]
1827              
1828             loadings # dim var x comp. correlation between variable and component
1829             [
1830             [ -0.978489 -0.955601 -0.992316 -0.97901 -0.942421]
1831             [ -0.122661 0.260224 0.0587476 0.114043 -0.316836]
1832             [ 0.0939749 0.108705 -0.0302253 -0.159616 -0.0101577]
1833             [ 0.13396 -0.0826915 0.0241629 0.0247483 -0.10639]
1834             [ 0.027153 0.0213708 -0.101789 0.0494365 0.00596076]
1835             ]
1836              
1837             pct_var # percent variance accounted for by each component
1838             [0.940384 0.0399209 0.00942842 0.00745963 0.00280691]
1839              
1840             Plot scores along the first two components,
1841              
1842             $d->plot_scores( $r{eigenvector} );
1843              
1844             =cut
1845              
1846             *pca = \&PDL::pca;
1847 3     3 0 16578 sub PDL::pca {
1848 3         17 my ($self, $opt) = @_;
1849             my %opt = (
1850             CORR => 1,
1851             PLOT => 0,
1852             WIN => undef, # for plotting
1853 3 50       14 );
  3         21  
1854 3 100       344 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1855             my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table;
1856 3         67 # value is axis pdl and score is var x axis
1857 3         4008 my ($eigvec, $eigval) = $var_var->eigens_sym;
1858             $eigvec = $eigvec->transpose; # compatibility with PDL::Slatec::eigsys
1859 3         182 # ind is sticky point for broadcasting
1860 3         196 my $ind_sorted = $eigval->qsorti->slice('-1:0');
1861 3         409 $eigvec = $eigvec->slice(':',$ind_sorted)->sever;
1862             $eigval = $eigval->slice($ind_sorted)->sever;
1863 3         289 # var x axis
1864 3         221 my $var = $eigval / $eigval->sum->sclr;
1865 3 100       14 my $loadings;
1866 2         8 if ($opt{CORR}) {
1867             $loadings = $eigvec * sqrt( $eigval->transpose );
1868             }
1869 1         25 else {
1870 1         185 my $scores = $eigvec x $self->dev_m;
1871             $loadings = $self->corr( $scores->dummy(1) );
1872             }
1873 3 50       389 $var->plot_screes(\%opt)
1874 3         99 if $opt{PLOT};
1875             ( eigenvalue=>$eigval, eigenvector=>$eigvec,
1876             pct_var=>$var, loadings=>$loadings );
1877             }
1878              
1879             =head2 pca_sorti
1880              
1881             Determine by which vars a component is best represented. Descending sort
1882             vars by size of association with that component. Returns sorted var and
1883             relevant component indices.
1884              
1885             =for options
1886              
1887             Default options (case insensitive):
1888              
1889             NCOMP => 10, # maximum number of components to consider
1890              
1891             =for usage
1892              
1893             Usage:
1894              
1895             # let's see if we replicated the Osgood et al. (1957) study
1896             pdl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0}
1897              
1898             # select a subset of var to do pca
1899             pdl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )]
1900             pdl> $data = $data( ,$ind)->sever
1901             pdl> @$idv = @$idv[list $ind]
1902              
1903             pdl> %m = $data->pca
1904              
1905             pdl> ($iv, $ic) = $m{loadings}->pca_sorti()
1906              
1907             pdl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv)
1908              
1909             # COMP0 COMP1 COMP2 COMP3
1910             HAPPY [0.860191 0.364911 0.174372 -0.10484]
1911             GOOD [0.848694 0.303652 0.198378 -0.115177]
1912             CALM [0.821177 -0.130542 0.396215 -0.125368]
1913             BRIGHT [0.78303 0.232808 -0.0534081 -0.0528796]
1914             HEAVY [-0.623036 0.454826 0.50447 0.073007]
1915             HARD [-0.679179 0.0505568 0.384467 0.165608]
1916             ACTIVE [-0.161098 0.760778 -0.44893 -0.0888592]
1917             FAST [-0.196042 0.71479 -0.471355 0.00460276]
1918             LARGE [-0.241994 0.594644 0.634703 -0.00618055]
1919             BASS [-0.621213 -0.124918 0.0605367 -0.765184]
1920              
1921             =cut
1922              
1923             *pca_sorti = \&PDL::pca_sorti;
1924             sub PDL::pca_sorti {
1925 1     1 0 9917 # $self is pdl (var x component)
1926 1         4 my ($self, $opt) = @_;
1927             my %opt = (
1928             NCOMP => 10, # maximum number of components to consider
1929 1 50       6 );
  0         0  
1930 1         8 if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
1931 1         187 my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min;
1932 1         244 $self = $self->dice_axis( 1, pdl(0..$ncomp-1) );
1933             my $icomp = $self->transpose->abs->maximum_ind;
1934 1         95 # sort between comp
1935 1         6 my $ivar_sort = $icomp->qsorti;
1936             $self = $self->slice($ivar_sort)->sever;
1937 1         74 # sort within comp
1938 1         12 my $ic = $icomp->slice($ivar_sort)->iv_cluster;
1939 3         893 for my $comp (0 .. $ic->dim(1)-1) {
1940 3         739 my $i = $self->slice(which($ic->slice(':',$comp)), "($comp)")->qsorti->slice('-1:0');
1941             $ivar_sort->slice(which $ic->slice(':',$comp))
1942             .= $ivar_sort->slice(which $ic->slice(':',$comp))->slice($i);
1943 1 50       421 }
1944             wantarray ? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort;
1945             }
1946              
1947             =head2 plot_means
1948              
1949             Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl).
1950              
1951             =for options
1952              
1953             Default options (case insensitive):
1954              
1955             IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
1956             DVNM => 'DV',
1957             AUTO => 1, # auto set dims to be on x-axis, line, panel
1958             # if set 0, dim 0 goes on x-axis, dim 1 as lines
1959             # dim 2+ as panels
1960             # see PDL::Graphics::Simple for next option
1961             WIN => undef, # pgswin object. not closed here if passed
1962             # allows comparing multiple lines in same plot
1963             SIZE => 640, # individual square panel size in pixels
1964              
1965             =for usage
1966              
1967             Usage:
1968              
1969             # see anova for mean / se pdl structure
1970             $mean->plot_means( $se, {IVNM=>['apple', 'bake']} );
1971              
1972             Or like this:
1973              
1974             $m{'| apple ~ bake | m'}->plot_means;
1975              
1976             =cut
1977              
1978             *plot_means = \&PDL::plot_means;
1979 0     0 0   sub PDL::plot_means {
1980 0 0         require PDL::Graphics::Simple;
1981 0           my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
1982 0           my ($self, $se) = @_;
1983 0 0         $self = $self->squeeze;
1984 0           if ($self->ndims > 4) {
1985 0           carp "Data is > 4D. No plot here.";
1986             return;
1987 0           }
1988             my %opt = (
1989             IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'],
1990             DVNM => 'DV',
1991             AUTO => 1, # auto set vars to be on X axis, line, panel
1992             WIN => undef, # PDL::Graphics::Simple object
1993             SIZE => 640, # individual square panel size in pixels
1994 0 0         );
  0            
1995             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
1996             # decide which vars to plot as x axis, lines, panels
1997             # put var w most levels on x axis
1998 0           # put var w least levels on diff panels
1999 0           my @iD = 0..3;
2000             my @dims = (1, 1, 1, 1);
2001 0           # splice ARRAY,OFFSET,LENGTH,LIST
2002 0           splice @dims, 0, $self->ndims, $self->dims;
2003 0 0         $self = $self->reshape(@dims)->sever;
2004             $se = $se->reshape(@dims)->sever
2005             if defined $se;
2006 0 0         @iD = reverse list qsorti pdl @dims
2007             if $opt{AUTO};
2008             # $iD[0] on x axis
2009 0           # $iD[1] as separate lines
2010 0           my $nx = $self->dim($iD[2]); # n xpanels
2011             my $ny = $self->dim($iD[3]); # n ypanels
2012 0   0       my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin(
2013 0           size=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny,'px']);
2014 0           my $seq0 = sequence(my $dim0 = $self->dim($iD[0]));
2015 0           my ($pcount, @plots) = 0;
2016 0           for my $y (0..$ny-1) {
2017 0           for my $x (0..$nx-1) {
2018 0 0         my $key_prefix = "$opt{IVNM}[$iD[0]]|";
2019 0 0         $key_prefix .= $opt{IVNM}[$iD[2]] . "=$x|" if $nx > 1;
2020 0           $key_prefix .= $opt{IVNM}[$iD[3]] . "=$y|" if $ny > 1;
2021 0           for (0 .. $self->dim($iD[1]) - 1) {
2022 0           my $ke = "$key_prefix$opt{IVNM}[$iD[1]]=$_";
2023 0           my $ydiced = $self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_)->squeeze;
2024             push @plots, with=>'lines', ke=>"$ke mean", style=>$pcount,
2025 0 0         $seq0+$pcount*0.05, $ydiced;
2026             push @plots, with=>'errorbars', ke=>"$ke error", style=>$pcount,
2027             $seq0+$pcount*0.05, $ydiced,
2028             $se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)
2029             ->dice_axis($iD[1],$_)->squeeze
2030 0           if defined($se);
2031             $pcount++;
2032             }
2033             }
2034 0 0         }
2035             my ($ymin, $ymax) = pdl($self, !defined $se ? () : ($self+$se, $self-$se))->minmax;
2036             $w->plot(@plots,
2037 0           { xlabel=>$opt{IVNM}[$iD[0]], ylabel=>$opt{DVNM},
2038             xrange=>[-0.05,$dim0-1+$pcount*0.05], yrange=>[$ymin-0.05,$ymax+0.05] }
2039 0           );
2040             $w;
2041             }
2042              
2043             =head2 plot_residuals
2044              
2045             Plots residuals against predicted values.
2046              
2047             =for usage
2048              
2049             Usage:
2050              
2051             use PDL::Graphics::Simple;
2052             $w = pgswin();
2053             $y->plot_residuals( $y_pred, { win=>$w } );
2054              
2055             =for options
2056              
2057             Default options (case insensitive):
2058              
2059             # see PDL::Graphics::Simple for more info
2060             WIN => undef, # pgswin object. not closed here if passed
2061             # allows comparing multiple lines in same plot
2062             SIZE => 640, # plot size in pixels
2063             COLOR => 1,
2064              
2065             =cut
2066              
2067             *plot_residuals = \&PDL::plot_residuals;
2068 0     0 0   sub PDL::plot_residuals {
2069 0 0         require PDL::Graphics::Simple;
2070 0           my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2071 0           my ($y, $y_pred) = @_;
2072             my %opt = (
2073             # see PDL::Graphics::Simple for next options
2074             WIN => undef, # pgswin object. not closed here if passed
2075             # allows comparing multiple lines in same plot
2076             SIZE => 640, # plot size in pixels
2077             COLOR => 1,
2078 0 0         );
  0            
2079 0           if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2080 0   0       my $residuals = $y - $y_pred;
2081             my $win = $opt{WIN} || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2082             $win->plot(
2083 0           with=>'points', style=>$opt{COLOR}, $y_pred, $residuals,
2084             with=>'lines', style=>$opt{COLOR}, pdl($y_pred->minmax), pdl(0,0), # 0-line
2085             {xlabel=>'predicted value', ylabel=>'residuals'},
2086             );
2087             }
2088              
2089              
2090             =head2 plot_scores
2091              
2092             Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.)
2093              
2094             =for options
2095              
2096             Default options (case insensitive):
2097              
2098             CORR => 1, # boolean. PCA was based on correlation or covariance
2099             COMP => [0,1], # indices to components to plot
2100             # see PDL::Graphics::Simple for next options
2101             WIN => undef, # pgswin object. not closed here if passed
2102             # allows comparing multiple lines in same plot
2103             SIZE => 640, # plot size in pixels
2104             COLOR => [1,2], # color for original and rotated scores
2105              
2106             =for usage
2107              
2108             Usage:
2109              
2110             my %p = $data->pca();
2111             $data->plot_scores( $p{eigenvector}, \%opt );
2112              
2113             =cut
2114              
2115 0     0 0   *plot_scores = \&PDL::plot_scores;
2116 0 0         sub PDL::plot_scores {
2117 0           require PDL::Graphics::Simple;
2118 0           my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2119             my ($self, $eigvec) = @_;
2120             my %opt = (
2121             CORR => 1, # boolean. PCA was based on correlation or covariance
2122             COMP => [0,1], # indices to components to plot
2123             # see PDL::Graphics::Simple for next options
2124             WIN => undef, # pgswin object. not closed here if passed
2125             # allows comparing multiple lines in same plot
2126             SIZE => 640, # plot size in pixels
2127 0 0         COLOR => [1,2], # color for original and transformed scoress
  0            
2128 0           );
2129 0 0         if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt }
2130             my $i = pdl $opt{COMP};
2131 0           my $z = $opt{CORR} ? $self->stddz : $self->dev_m;
2132 0           # transformed normed values
2133 0   0       my $scores = sumover($eigvec->slice(':',$i) * $z->transpose->dummy(1))->transpose;
2134             $z = $z->slice(':',$i)->sever;
2135             my $win = $opt{WIN} || PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2136 0           $win->plot(
2137             with=>'points', style=>$opt{COLOR}[0], ke=>'original', $z->slice(',(0)'), $z->slice(',(1)'),
2138             with=>'points', style=>$opt{COLOR}[1], ke=>'transformed', $scores->slice(',(0)'), $scores->slice(',(1)'),
2139             {xlabel=>"Component $opt{COMP}[0]", ylabel=>"Component $opt{COMP}[1]"},
2140             );
2141             }
2142              
2143              
2144             =head2 plot_screes
2145              
2146             Scree plot. Plots proportion of variance accounted for by PCA components.
2147              
2148             =for options
2149              
2150             Default options (case insensitive):
2151              
2152             NCOMP => 20, # max number of components to plot
2153             CUT => 0, # set to plot cutoff line after this many components
2154             # undef to plot suggested cutoff line for NCOMP comps
2155             # see PDL::Graphics::Simple for next options
2156             WIN => undef, # pgswin object. not closed here if passed
2157             # allows comparing multiple lines in same plot
2158             SIZE => 640, # plot size in pixels
2159              
2160             =for usage
2161              
2162             Usage:
2163              
2164             # variance should be in descending order
2165             $d = qsort random 10, 5; # 10 obs on 5 variables
2166             %pca = $d->pca( \%opt );
2167             $pca{pct_var}->plot_screes( {ncomp=>16, win=>$win=PDL::Graphics::Simple::pgswin()} );
2168              
2169             Or, because NCOMP is used so often, it is allowed a shortcut,
2170              
2171             $pca{pct_var}->plot_screes( 16 );
2172              
2173             =cut
2174              
2175 0     0 0   *plot_scree = \&PDL::plot_screes; # here for now for compatibility
2176 0 0         *plot_screes = \&PDL::plot_screes;
2177 0           sub PDL::plot_screes {
2178 0           require PDL::Graphics::Simple;
2179             my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2180             my ($self, $ncomp) = @_;
2181             my %opt = (
2182             NCOMP => 20, # max number of components to plot
2183             CUT => 0, # set to plot cutoff line after this many components
2184             # undef to plot suggested cutoff line for NCOMP comps
2185             # see PDL::Graphics::Simple for next options
2186             WIN => undef, # pgswin object. not closed here if passed
2187 0 0         # allows comparing multiple lines in same plot
  0            
2188 0 0         SIZE => 640, # plot size in pixels
2189             );
2190             if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2191 0 0         $opt{NCOMP} = $ncomp
2192 0           if $ncomp;
2193             # re-use $ncomp below
2194 0 0         $ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP};
2195             my $self_comp = $self->slice([0,$ncomp-1]);
2196 0   0       $opt{CUT} = PDL::Stats::Kmeans::_scree_ind $self_comp
2197             if !defined $opt{CUT};
2198             my $win = $opt{WIN} ||
2199 0 0         PDL::Graphics::Simple::pgswin(size=>[@opt{qw(SIZE SIZE)}, 'px']);
2200             $win->plot(
2201             with=>'lines', ke=>'scree', sequence($ncomp), $self_comp,
2202             !$opt{CUT} ? () : (with=>'lines', ke=>'cut', pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max->sclr+.05)),
2203             {xlabel=>'Component', ylabel=>'Proportion of Variance Accounted for',
2204             xrange=>[-0.05,$ncomp-0.95], yrange=>[0,1], le=>'tr'},
2205             );
2206             }
2207              
2208             =head2 plot_stripchart
2209              
2210             Stripchart plot. Plots ANOVA-style data, categorised against given IVs.
2211              
2212             =for options
2213              
2214             Default options (case insensitive):
2215              
2216             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
2217             DVNM => 'DV',
2218             # see PDL::Graphics::Simple for next options
2219             WIN => undef, # pgswin object. not closed here if passed
2220              
2221             =for usage
2222              
2223             Usage:
2224              
2225             %m = $y->plot_stripchart( $a, \@b, { IVNM=>[qw(apple bake)] } );
2226              
2227             =cut
2228              
2229 0     0 0   my $CHART_GAP = 0.1;
2230 0 0         *plot_stripchart = \&PDL::plot_stripchart;
2231 0           sub PDL::plot_stripchart {
2232 0           require PDL::Graphics::Simple;
2233             my $opt = ref($_[-1]) eq 'HASH' ? pop @_ : undef;
2234             my ($y, @ivs_raw) = @_;
2235             my %opt = (
2236             IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ]
2237 0 0         DVNM => 'DV',
  0            
2238 0           WIN => undef, # pgswin object. not closed here if passed
2239 0 0 0       );
  0            
2240 0   0       if ($opt) { $opt{uc $_} = $opt->{$_} for keys %$opt; }
2241 0           $opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ]
2242             if !$opt{IVNM} or !@{ $opt{IVNM} };
2243 0           my $w = $opt{WIN} || PDL::Graphics::Simple::pgswin();
  0            
2244 0           my @codes = map [code_ivs($_)], @ivs_raw;
  0            
2245             my @levels = map {
2246 0           my $map = $_->[1];
2247 0           [sort {$map->{$a} <=> $map->{$b}} keys %$map];
2248 0           } @codes;
2249 0           my $xjitter = $y->random * $CHART_GAP;
2250 0           my ($pcount, @plots) = 0;
2251 0           push @plots, with=>'points', ke=>"all data", $xjitter+$pcount, $y;
2252 0           $pcount++;
2253 0           for my $i (0..$#ivs_raw) {
2254 0           my $levs = $levels[$i];
2255 0           my $name = $opt{IVNM}[$i];
2256 0           my $coded = $codes[$i][0];
2257             for my $j (0..$#$levs) {
2258             my $inds = which($coded == $j);
2259 0           push @plots, with=>'points', ke=>"$name=$levs->[$j]",
2260             $xjitter->slice($inds)+$pcount+$j*$CHART_GAP, $y->slice($inds);
2261 0           }
2262 0           $pcount++;
  0            
2263             }
2264             my ($ymin, $ymax) = $y->minmax;
2265 0           my $xmax = $pcount-1 + $CHART_GAP*($#{$levels[-1]} + 2);
2266             $w->plot(@plots,
2267 0           { xlabel=>'IV', ylabel=>$opt{DVNM},
2268             xrange=>[-1,$xmax], yrange=>[$ymin-$CHART_GAP,$ymax+$CHART_GAP] }
2269             );
2270             $w;
2271             }
2272              
2273             =head1 SEE ALSO
2274              
2275             L
2276              
2277             L
2278              
2279             =head1 REFERENCES
2280              
2281             Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied
2282             Multiple Regression/correlation Analysis for the Behavioral Sciences
2283             (3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers.
2284              
2285             Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd
2286             ed.). New York, NY: Wiley-Interscience.
2287              
2288             Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated
2289             measures data in cognitive research. Journal of Experimental Psychology:
2290             Learning, Memory, & Cognition, 16, 149-157.
2291              
2292             Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of
2293             Meaning. Champaign, IL: University of Illinois Press.
2294              
2295             Rutherford, A. (2011). ANOVA and ANCOVA: A GLM Approach (2nd ed.). Wiley.
2296              
2297             Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved
2298             April 10, 2011 from http://citeseerx.ist.psu.edu/
2299              
2300             The GLM procedure: unbalanced ANOVA for two-way design with
2301             interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18,
2302             2009 from http://support.sas.com/
2303              
2304             Van den Noortgate, W., & Onghena, P. (2006). Analysing repeated
2305             measures data in cognitive research: A comment on regression coefficient
2306             analyses. European Journal of Cognitive Psychology, 18, 937-952.
2307              
2308             =head1 AUTHOR
2309              
2310             Copyright (C) 2009 Maggie J. Xiong
2311              
2312             All rights reserved. There is no warranty. You are allowed to redistribute
2313             this software / documentation as described in the file COPYING in the
2314             PDL distribution.
2315              
2316             =cut
2317              
2318             EOD
2319              
2320             pp_done();