line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# GENERATED WITH PDLA::PP! Don't modify! |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
package PDLA::Stats::GLM; |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
@EXPORT_OK = qw( ols_t anova anova_rptd dummy_code effect_code effect_code_w interaction_code ols ols_rptd r2_change logistic pca pca_sorti plot_means plot_residuals plot_screes PDLA::PP fill_m PDLA::PP fill_rand PDLA::PP dev_m PDLA::PP stddz PDLA::PP sse PDLA::PP mse PDLA::PP rmse PDLA::PP pred_logistic PDLA::PP d0 PDLA::PP dm PDLA::PP dvrs ); |
8
|
|
|
|
|
|
|
%EXPORT_TAGS = (Func=>[@EXPORT_OK]); |
9
|
|
|
|
|
|
|
|
10
|
2
|
|
|
2
|
|
1427
|
use PDLA::Core; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
12
|
|
11
|
2
|
|
|
2
|
|
459
|
use PDLA::Exporter; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
10
|
|
12
|
2
|
|
|
2
|
|
52
|
use DynaLoader; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
93
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
@ISA = ( 'PDLA::Exporter','DynaLoader' ); |
18
|
|
|
|
|
|
|
push @PDLA::Core::PP, __PACKAGE__; |
19
|
|
|
|
|
|
|
bootstrap PDLA::Stats::GLM ; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
2
|
|
|
2
|
|
10
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
49
|
|
26
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
85
|
|
27
|
|
|
|
|
|
|
|
28
|
2
|
|
|
2
|
|
21
|
use Carp; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
114
|
|
29
|
2
|
|
|
2
|
|
9
|
use PDLA::LiteF; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
13
|
|
30
|
2
|
|
|
2
|
|
3009
|
use PDLA::MatrixOps; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
13
|
|
31
|
2
|
|
|
2
|
|
251
|
use PDLA::NiceSlice; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
13
|
|
32
|
2
|
|
|
2
|
|
118770
|
use PDLA::Stats::Basic; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
23
|
|
33
|
2
|
|
|
2
|
|
1621
|
use PDLA::Stats::Kmeans; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
15
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
$PDLA::onlinedoc->scan(__FILE__) if $PDLA::onlinedoc; |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
eval { require PDLA::GSL::CDF; }; |
38
|
|
|
|
|
|
|
my $CDF = 1 if !$@; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
eval { require PDLA::Slatec; }; |
41
|
|
|
|
|
|
|
my $SLATEC = 1 if !$@; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
eval { |
44
|
|
|
|
|
|
|
require PDLA::Graphics::PGPLOT::Window; |
45
|
|
|
|
|
|
|
PDLA::Graphics::PGPLOT::Window->import( 'pgwin' ); |
46
|
|
|
|
|
|
|
}; |
47
|
|
|
|
|
|
|
my $PGPLOT = 1 if !$@; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
my $DEV = ($^O =~ /win/i)? '/png' : '/xs'; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head1 NAME |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
PDLA::Stats::GLM -- general and generalized linear modeling methods such as ANOVA, linear regression, PCA, and logistic regression. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head1 DESCRIPTION |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
The terms FUNCTIONS and METHODS are arbitrarily used to refer to methods that are threadable and methods that are NOT threadable, respectively. FUNCTIONS except B support bad value. B strongly recommended for most METHODS, and it is required for B. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
P-values, where appropriate, are provided if PDLA::GSL::CDF is installed. |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
=head1 SYNOPSIS |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
use PDLA::LiteF; |
64
|
|
|
|
|
|
|
use PDLA::NiceSlice; |
65
|
|
|
|
|
|
|
use PDLA::Stats::GLM; |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# do a multiple linear regression and plot the residuals |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my $y = pdl( 8, 7, 7, 0, 2, 5, 0 ); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
my $x = pdl( [ 0, 1, 2, 3, 4, 5, 6 ], # linear component |
72
|
|
|
|
|
|
|
[ 0, 1, 4, 9, 16, 25, 36 ] ); # quadratic component |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
my %m = $y->ols( $x, {plot=>1} ); |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
print "$_\t$m{$_}\n" for (sort keys %m); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=cut |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=head1 FUNCTIONS |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=cut |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head2 fill_m |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=for sig |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Signature: (a(n); float+ [o]b(n)) |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=for ref |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
Replaces bad values with sample mean. Mean is set to 0 if all obs are bad. Can be done inplace. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=for usage |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
perldl> p $data |
112
|
|
|
|
|
|
|
[ |
113
|
|
|
|
|
|
|
[ 5 BAD 2 BAD] |
114
|
|
|
|
|
|
|
[ 7 3 7 BAD] |
115
|
|
|
|
|
|
|
] |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
perldl> p $data->fill_m |
118
|
|
|
|
|
|
|
[ |
119
|
|
|
|
|
|
|
[ 5 3.5 2 3.5] |
120
|
|
|
|
|
|
|
[ 7 3 7 5.66667] |
121
|
|
|
|
|
|
|
] |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=for bad |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
The output pdl badflag is cleared. |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=cut |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
*fill_m = \&PDLA::fill_m; |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head2 fill_rand |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=for sig |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Signature: (a(n); [o]b(n)) |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=for ref |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
Replaces bad values with random sample (with replacement) of good observations from the same variable. Can be done inplace. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=for usage |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
perldl> p $data |
159
|
|
|
|
|
|
|
[ |
160
|
|
|
|
|
|
|
[ 5 BAD 2 BAD] |
161
|
|
|
|
|
|
|
[ 7 3 7 BAD] |
162
|
|
|
|
|
|
|
] |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
perldl> p $data->fill_rand |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
[ |
167
|
|
|
|
|
|
|
[5 2 2 5] |
168
|
|
|
|
|
|
|
[7 3 7 7] |
169
|
|
|
|
|
|
|
] |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=for bad |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
The output pdl badflag is cleared. |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=cut |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
*fill_rand = \&PDLA::fill_rand; |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=head2 dev_m |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=for sig |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Signature: (a(n); float+ [o]b(n)) |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=for ref |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
Replaces values with deviations from the mean. Can be done inplace. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=for bad |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
dev_m processes bad values. |
210
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=cut |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
*dev_m = \&PDLA::dev_m; |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=head2 stddz |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=for sig |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Signature: (a(n); float+ [o]b(n)) |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=for ref |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
Standardize ie replace values with z_scores based on sample standard deviation from the mean (replace with 0s if stdv==0). Can be done inplace. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=for bad |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
stddz processes bad values. |
243
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=cut |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
*stddz = \&PDLA::stddz; |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head2 sse |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=for sig |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
Signature: (a(n); b(n); float+ [o]c()) |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=for ref |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
Sum of squared errors between actual and predicted values. |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
=for bad |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
sse processes bad values. |
277
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=cut |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
*sse = \&PDLA::sse; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=head2 mse |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=for sig |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
Signature: (a(n); b(n); float+ [o]c()) |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=for ref |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Mean of squared errors between actual and predicted values, ie variance around predicted value. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
=for bad |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
mse processes bad values. |
311
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=cut |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
*mse = \&PDLA::mse; |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=head2 rmse |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
=for sig |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
Signature: (a(n); b(n); float+ [o]c()) |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=for ref |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
Root mean squared error, ie stdv around predicted value. |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
=for bad |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
rmse processes bad values. |
345
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
=cut |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
*rmse = \&PDLA::rmse; |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
=head2 pred_logistic |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
=for sig |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
Signature: (a(n,m); b(m); float+ [o]c(n)) |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
=for ref |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
Calculates predicted prob value for logistic regression. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=for usage |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
# glue constant then apply coeff returned by the logistic method |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
$pred = $x->glue(1,ones($x->dim(0)))->pred_logistic( $m{b} ); |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
=for bad |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
pred_logistic processes bad values. |
385
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
=cut |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
*pred_logistic = \&PDLA::pred_logistic; |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=head2 d0 |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=for sig |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Signature: (a(n); float+ [o]c()) |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=for usage |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
my $d0 = $y->d0(); |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
=for ref |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
Null deviance for logistic regression. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
=for bad |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
d0 processes bad values. |
422
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=cut |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
*d0 = \&PDLA::d0; |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head2 dm |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=for sig |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Signature: (a(n); b(n); float+ [o]c()) |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=for usage |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
my $dm = $y->dm( $y_pred ); |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
# null deviance |
450
|
|
|
|
|
|
|
my $d0 = $y->dm( ones($y->nelem) * $y->avg ); |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=for ref |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Model deviance for logistic regression. |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
=for bad |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
dm processes bad values. |
462
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=cut |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
*dm = \&PDLA::dm; |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=head2 dvrs |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=for sig |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
Signature: (a(); b(); float+ [o]c()) |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=for ref |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Deviance residual for logistic regression. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
=for bad |
494
|
|
|
|
|
|
|
|
495
|
|
|
|
|
|
|
dvrs processes bad values. |
496
|
|
|
|
|
|
|
It will set the bad-value flag of all output piddles if the flag is set for any of the input piddles. |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=cut |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
*dvrs = \&PDLA::dvrs; |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# my tmp var for PDLA 2.007 slice upate |
512
|
|
|
|
|
|
|
my $_tmp; |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=head2 ols_t |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=for ref |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Threaded version of ordinary least squares regression (B). The price of threading was losing significance tests for coefficients (but see B). The fitting function was shamelessly copied then modified from PDLA::Fit::Linfit. Uses PDLA::Slatec when possible but otherwise uses PDLA::MatrixOps. Intercept is LAST of coeff if CONST => 1. |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
ols_t does not handle bad values. consider B or B if there are bad values. |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
=for options |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
Default options (case insensitive): |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
CONST => 1, |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
=for usage |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
Usage: |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
# DV, 2 person's ratings for top-10 box office movies |
533
|
|
|
|
|
|
|
# ascending sorted by box office numbers |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
perldl> p $y = qsort ceil( random(10, 2)*5 ) |
536
|
|
|
|
|
|
|
[ |
537
|
|
|
|
|
|
|
[1 1 2 4 4 4 4 5 5 5] |
538
|
|
|
|
|
|
|
[1 2 2 2 3 3 3 3 5 5] |
539
|
|
|
|
|
|
|
] |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
# model with 2 IVs, a linear and a quadratic trend component |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
perldl> $x = cat sequence(10), sequence(10)**2 |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# suppose our novice modeler thinks this creates 3 different models |
546
|
|
|
|
|
|
|
# for predicting movie ratings |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
perldl> p $x = cat $x, $x * 2, $x * 3 |
549
|
|
|
|
|
|
|
[ |
550
|
|
|
|
|
|
|
[ |
551
|
|
|
|
|
|
|
[ 0 1 2 3 4 5 6 7 8 9] |
552
|
|
|
|
|
|
|
[ 0 1 4 9 16 25 36 49 64 81] |
553
|
|
|
|
|
|
|
] |
554
|
|
|
|
|
|
|
[ |
555
|
|
|
|
|
|
|
[ 0 2 4 6 8 10 12 14 16 18] |
556
|
|
|
|
|
|
|
[ 0 2 8 18 32 50 72 98 128 162] |
557
|
|
|
|
|
|
|
] |
558
|
|
|
|
|
|
|
[ |
559
|
|
|
|
|
|
|
[ 0 3 6 9 12 15 18 21 24 27] |
560
|
|
|
|
|
|
|
[ 0 3 12 27 48 75 108 147 192 243] |
561
|
|
|
|
|
|
|
] |
562
|
|
|
|
|
|
|
] |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
perldl> p $x->info |
565
|
|
|
|
|
|
|
PDLA: Double D [10,2,3] |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
# insert a dummy dim between IV and the dim (model) to be threaded |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
perldl> %m = $y->ols_t( $x->dummy(2) ) |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
perldl> p "$_\t$m{$_}\n" for (sort keys %m) |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# 2 persons' ratings, eached fitted with 3 "different" models |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
F |
576
|
|
|
|
|
|
|
[ |
577
|
|
|
|
|
|
|
[ 38.314159 25.087209] |
578
|
|
|
|
|
|
|
[ 38.314159 25.087209] |
579
|
|
|
|
|
|
|
[ 38.314159 25.087209] |
580
|
|
|
|
|
|
|
] |
581
|
|
|
|
|
|
|
|
582
|
|
|
|
|
|
|
# df is the same across dv and iv models |
583
|
|
|
|
|
|
|
|
584
|
|
|
|
|
|
|
F_df [2 7] |
585
|
|
|
|
|
|
|
F_p |
586
|
|
|
|
|
|
|
[ |
587
|
|
|
|
|
|
|
[0.00016967051 0.00064215074] |
588
|
|
|
|
|
|
|
[0.00016967051 0.00064215074] |
589
|
|
|
|
|
|
|
[0.00016967051 0.00064215074] |
590
|
|
|
|
|
|
|
] |
591
|
|
|
|
|
|
|
|
592
|
|
|
|
|
|
|
R2 |
593
|
|
|
|
|
|
|
[ |
594
|
|
|
|
|
|
|
[ 0.9162963 0.87756762] |
595
|
|
|
|
|
|
|
[ 0.9162963 0.87756762] |
596
|
|
|
|
|
|
|
[ 0.9162963 0.87756762] |
597
|
|
|
|
|
|
|
] |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
b |
600
|
|
|
|
|
|
|
[ # linear quadratic constant |
601
|
|
|
|
|
|
|
[ |
602
|
|
|
|
|
|
|
[ 0.99015152 -0.056818182 0.66363636] # person 1 |
603
|
|
|
|
|
|
|
[ 0.18939394 0.022727273 1.4] # person 2 |
604
|
|
|
|
|
|
|
] |
605
|
|
|
|
|
|
|
[ |
606
|
|
|
|
|
|
|
[ 0.49507576 -0.028409091 0.66363636] |
607
|
|
|
|
|
|
|
[ 0.09469697 0.011363636 1.4] |
608
|
|
|
|
|
|
|
] |
609
|
|
|
|
|
|
|
[ |
610
|
|
|
|
|
|
|
[ 0.33005051 -0.018939394 0.66363636] |
611
|
|
|
|
|
|
|
[ 0.063131313 0.0075757576 1.4] |
612
|
|
|
|
|
|
|
] |
613
|
|
|
|
|
|
|
] |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
# our novice modeler realizes at this point that |
616
|
|
|
|
|
|
|
# the 3 models only differ in the scaling of the IV coefficients |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
ss_model |
619
|
|
|
|
|
|
|
[ |
620
|
|
|
|
|
|
|
[ 20.616667 13.075758] |
621
|
|
|
|
|
|
|
[ 20.616667 13.075758] |
622
|
|
|
|
|
|
|
[ 20.616667 13.075758] |
623
|
|
|
|
|
|
|
] |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
ss_residual |
626
|
|
|
|
|
|
|
[ |
627
|
|
|
|
|
|
|
[ 1.8833333 1.8242424] |
628
|
|
|
|
|
|
|
[ 1.8833333 1.8242424] |
629
|
|
|
|
|
|
|
[ 1.8833333 1.8242424] |
630
|
|
|
|
|
|
|
] |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
ss_total [22.5 14.9] |
633
|
|
|
|
|
|
|
y_pred |
634
|
|
|
|
|
|
|
[ |
635
|
|
|
|
|
|
|
[ |
636
|
|
|
|
|
|
|
[0.66363636 1.5969697 2.4166667 3.1227273 ... 4.9727273] |
637
|
|
|
|
|
|
|
... |
638
|
|
|
|
|
|
|
|
639
|
|
|
|
|
|
|
=cut |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
*ols_t = \&PDLA::ols_t; |
642
|
|
|
|
|
|
|
sub PDLA::ols_t { |
643
|
|
|
|
|
|
|
# y [n], ivs [n x attr] pdl |
644
|
105
|
|
|
105
|
0
|
9936
|
my ($y, $ivs, $opt) = @_; |
645
|
105
|
|
|
|
|
332
|
my %opt = ( CONST => 1 ); |
646
|
105
|
|
33
|
|
|
840
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
647
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# $y = $y->squeeze; |
649
|
105
|
100
|
|
|
|
535
|
$ivs = $ivs->dummy(1) if $ivs->getndims == 1; |
650
|
|
|
|
|
|
|
# set up ivs and const as ivs |
651
|
|
|
|
|
|
|
$opt{CONST} and |
652
|
105
|
100
|
|
|
|
472
|
$ivs = $ivs->glue( 1, ones($ivs->dim(0)) ); |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
# Internally normalise data |
655
|
|
|
|
|
|
|
# (double) it or ushort y and sequence iv won't work right |
656
|
105
|
|
|
|
|
3261
|
my $ymean = $y->abs->sumover->double / $y->dim(0); |
657
|
105
|
|
|
|
|
9145
|
($_tmp = $ymean->where( $ymean==0 )) .= 1; |
658
|
105
|
|
|
|
|
8098
|
my $y2 = $y / $ymean->dummy(0); |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
# Do the fit |
661
|
|
|
|
|
|
|
|
662
|
105
|
|
|
|
|
3741
|
my $Y = $ivs x $y2->dummy(0); |
663
|
|
|
|
|
|
|
|
664
|
105
|
|
|
|
|
6648
|
my $C; |
665
|
|
|
|
|
|
|
|
666
|
105
|
50
|
|
|
|
291
|
if ( $SLATEC ) { |
667
|
0
|
|
|
|
|
0
|
$C = PDLA::Slatec::matinv( $ivs x $ivs->xchg(0,1) ); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
else { |
670
|
105
|
|
|
|
|
569
|
$C = inv( $ivs x $ivs->xchg(0,1) ); |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# Fitted coefficients vector |
674
|
105
|
|
|
|
|
4626069
|
my $coeff = PDLA::squeeze( $C x $Y ); |
675
|
|
|
|
|
|
|
|
676
|
105
|
100
|
100
|
|
|
11671
|
$coeff = $coeff->dummy(0) |
677
|
|
|
|
|
|
|
if $coeff->getndims == 1 and $y->getndims > 1; |
678
|
105
|
|
|
|
|
481
|
$coeff *= $ymean->dummy(0); # Un-normalise |
679
|
|
|
|
|
|
|
|
680
|
105
|
100
|
|
|
|
4759
|
return $coeff |
681
|
|
|
|
|
|
|
unless wantarray; |
682
|
|
|
|
|
|
|
|
683
|
4
|
|
|
|
|
6
|
my %ret; |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
# ***$coeff x $ivs looks nice but produces nan on successive tries*** |
686
|
4
|
|
|
|
|
14
|
$ret{y_pred} = sumover( $coeff->dummy(1) * $ivs->xchg(0,1) ); |
687
|
4
|
100
|
|
|
|
240
|
$ret{ss_total} = $opt{CONST}? $y->ss : sumover( $y ** 2 ); |
688
|
4
|
|
|
|
|
47
|
$ret{ss_residual} = $y->sse( $ret{y_pred} ); |
689
|
4
|
|
|
|
|
33
|
$ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; |
690
|
4
|
|
|
|
|
25
|
$ret{R2} = $ret{ss_model} / $ret{ss_total}; |
691
|
|
|
|
|
|
|
|
692
|
4
|
100
|
|
|
|
17
|
my $n_var = $opt{CONST}? $ivs->dim(1) - 1 : $ivs->dim(1); |
693
|
4
|
|
|
|
|
18
|
$ret{F_df} = pdl( $n_var, $y->dim(0) - $ivs->dim(1) ); |
694
|
|
|
|
|
|
|
$ret{F} |
695
|
4
|
|
|
|
|
84
|
= $ret{ss_model} / $ret{F_df}->(0) / ($ret{ss_residual} / $ret{F_df}->(1)); |
696
|
|
|
|
|
|
|
$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) |
697
|
4
|
50
|
|
|
|
141
|
if $CDF; |
698
|
|
|
|
|
|
|
|
699
|
4
|
50
|
|
|
|
13
|
for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze }; |
|
28
|
|
|
|
|
829
|
|
700
|
|
|
|
|
|
|
|
701
|
4
|
|
|
|
|
140
|
$ret{b} = $coeff; |
702
|
|
|
|
|
|
|
|
703
|
4
|
|
|
|
|
48
|
return %ret; |
704
|
|
|
|
|
|
|
} |
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=head2 r2_change |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
=for ref |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
Significance test for the incremental change in R2 when new variable(s) are added to an ols regression model. Returns the change stats as well as stats for both models. Based on B. (One way to make up for the lack of significance tests for coeffs in ols_t). |
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
=for options |
713
|
|
|
|
|
|
|
|
714
|
|
|
|
|
|
|
Default options (case insensitive): |
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
CONST => 1, |
717
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
=for usage |
719
|
|
|
|
|
|
|
|
720
|
|
|
|
|
|
|
Usage: |
721
|
|
|
|
|
|
|
|
722
|
|
|
|
|
|
|
# suppose these are two persons' ratings for top 10 box office movies |
723
|
|
|
|
|
|
|
# ascending sorted by box office |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
perldl> p $y = qsort ceil(random(10, 2) * 5) |
726
|
|
|
|
|
|
|
[ |
727
|
|
|
|
|
|
|
[1 1 2 2 2 3 4 4 4 4] |
728
|
|
|
|
|
|
|
[1 2 2 3 3 3 4 4 5 5] |
729
|
|
|
|
|
|
|
] |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
# first IV is a simple linear trend |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
perldl> p $x1 = sequence 10 |
734
|
|
|
|
|
|
|
[0 1 2 3 4 5 6 7 8 9] |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
# the modeler wonders if adding a quadratic trend improves the fit |
737
|
|
|
|
|
|
|
|
738
|
|
|
|
|
|
|
perldl> p $x2 = sequence(10) ** 2 |
739
|
|
|
|
|
|
|
[0 1 4 9 16 25 36 49 64 81] |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
# two difference models are given in two pdls |
742
|
|
|
|
|
|
|
# each as would be pass on to ols_t |
743
|
|
|
|
|
|
|
# the 1st model includes only linear trend |
744
|
|
|
|
|
|
|
# the 2nd model includes linear and quadratic trends |
745
|
|
|
|
|
|
|
# when necessary use dummy dim so both models have the same ndims |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
perldl> %c = $y->r2_change( $x1->dummy(1), cat($x1, $x2) ) |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
perldl> p "$_\t$c{$_}\n" for (sort keys %c) |
750
|
|
|
|
|
|
|
# person 1 person 2 |
751
|
|
|
|
|
|
|
F_change [0.72164948 0.071283096] |
752
|
|
|
|
|
|
|
# df same for both persons |
753
|
|
|
|
|
|
|
F_df [1 7] |
754
|
|
|
|
|
|
|
F_p [0.42370145 0.79717232] |
755
|
|
|
|
|
|
|
R2_change [0.0085966043 0.00048562549] |
756
|
|
|
|
|
|
|
model0 HASH(0x8c10828) |
757
|
|
|
|
|
|
|
model1 HASH(0x8c135c8) |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# the answer here is no. |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
=cut |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
*r2_change = \&PDLA::r2_change; |
764
|
|
|
|
|
|
|
sub PDLA::r2_change { |
765
|
1
|
|
|
1
|
0
|
2104
|
my ($self, $ivs0, $ivs1, $opt) = @_; |
766
|
1
|
50
|
|
|
|
6
|
$ivs0->getndims == 1 and $ivs0 = $ivs0->dummy(1); |
767
|
|
|
|
|
|
|
|
768
|
1
|
|
|
|
|
26
|
my %ret; |
769
|
|
|
|
|
|
|
|
770
|
1
|
|
|
|
|
3
|
$ret{model0} = { $self->ols_t( $ivs0, $opt ) }; |
771
|
1
|
|
|
|
|
4
|
$ret{model1} = { $self->ols_t( $ivs1, $opt ) }; |
772
|
|
|
|
|
|
|
|
773
|
1
|
|
|
|
|
10
|
$ret{R2_change} = $ret{model1}->{R2} - $ret{model0}->{R2}; |
774
|
|
|
|
|
|
|
$ret{F_df} |
775
|
|
|
|
|
|
|
= pdl($ivs1->dim(1) - $ivs0->dim(1), |
776
|
1
|
|
|
|
|
7
|
$ret{model1}->{F_df}->((1)) ); |
777
|
|
|
|
|
|
|
$ret{F_change} |
778
|
|
|
|
|
|
|
= $ret{R2_change} * $ret{F_df}->((1)) |
779
|
1
|
|
|
|
|
34
|
/ ( (1-$ret{model1}->{R2}) * $ret{F_df}->((0)) ); |
780
|
|
|
|
|
|
|
$ret{F_p} = 1 - $ret{F_change}->gsl_cdf_fdist_P( $ret{F_df}->dog ) |
781
|
1
|
50
|
|
|
|
47
|
if $CDF; |
782
|
|
|
|
|
|
|
|
783
|
1
|
100
|
|
|
|
3
|
for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze }; |
|
5
|
|
|
|
|
73
|
|
784
|
|
|
|
|
|
|
|
785
|
1
|
|
|
|
|
33
|
return %ret; |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
=head1 METHODS |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
=head2 anova |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=for ref |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
Analysis of variance. Uses type III sum of squares for unbalanced data. |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
Dependent variable should be a 1D pdl. Independent variables can be passed as 1D perl array ref or 1D pdl. |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
Supports bad value (by ignoring missing or BAD values in dependent and independent variables list-wise). |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
=for options |
801
|
|
|
|
|
|
|
|
802
|
|
|
|
|
|
|
Default options (case insensitive): |
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
V => 1, # carps if bad value in variables |
805
|
|
|
|
|
|
|
IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] |
806
|
|
|
|
|
|
|
PLOT => 1, # plots highest order effect |
807
|
|
|
|
|
|
|
# can set plot_means options here |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
=for usage |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
Usage: |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
# suppose this is ratings for 12 apples |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
perldl> p $y = qsort ceil( random(12)*5 ) |
816
|
|
|
|
|
|
|
[1 1 2 2 2 3 3 4 4 4 5 5] |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
# IV for types of apple |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
perldl> p $a = sequence(12) % 3 + 1 |
821
|
|
|
|
|
|
|
[1 2 3 1 2 3 1 2 3 1 2 3] |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
# IV for whether we baked the apple |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
perldl> @b = qw( y y y y y y n n n n n n ) |
826
|
|
|
|
|
|
|
|
827
|
|
|
|
|
|
|
perldl> %m = $y->anova( $a, \@b, { IVNM=>['apple', 'bake'] } ) |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
perldl> p "$_\t$m{$_}\n" for (sort keys %m) |
830
|
|
|
|
|
|
|
# apple # m |
831
|
|
|
|
|
|
|
[ |
832
|
|
|
|
|
|
|
[2.5 3 3.5] |
833
|
|
|
|
|
|
|
] |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
# apple # se |
836
|
|
|
|
|
|
|
[ |
837
|
|
|
|
|
|
|
[0.64549722 0.91287093 0.64549722] |
838
|
|
|
|
|
|
|
] |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
# apple ~ bake # m |
841
|
|
|
|
|
|
|
[ |
842
|
|
|
|
|
|
|
[1.5 1.5 2.5] |
843
|
|
|
|
|
|
|
[3.5 4.5 4.5] |
844
|
|
|
|
|
|
|
] |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
# apple ~ bake # se |
847
|
|
|
|
|
|
|
[ |
848
|
|
|
|
|
|
|
[0.5 0.5 0.5] |
849
|
|
|
|
|
|
|
[0.5 0.5 0.5] |
850
|
|
|
|
|
|
|
] |
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
# bake # m |
853
|
|
|
|
|
|
|
[ |
854
|
|
|
|
|
|
|
[ 1.8333333 4.1666667] |
855
|
|
|
|
|
|
|
] |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
# bake # se |
858
|
|
|
|
|
|
|
[ |
859
|
|
|
|
|
|
|
[0.30731815 0.30731815] |
860
|
|
|
|
|
|
|
] |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
F 7.6 |
863
|
|
|
|
|
|
|
F_df [5 6] |
864
|
|
|
|
|
|
|
F_p 0.0141586545851857 |
865
|
|
|
|
|
|
|
ms_model 3.8 |
866
|
|
|
|
|
|
|
ms_residual 0.5 |
867
|
|
|
|
|
|
|
ss_model 19 |
868
|
|
|
|
|
|
|
ss_residual 3 |
869
|
|
|
|
|
|
|
ss_total 22 |
870
|
|
|
|
|
|
|
| apple | F 2 |
871
|
|
|
|
|
|
|
| apple | F_df [2 6] |
872
|
|
|
|
|
|
|
| apple | F_p 0.216 |
873
|
|
|
|
|
|
|
| apple | ms 1 |
874
|
|
|
|
|
|
|
| apple | ss 2 |
875
|
|
|
|
|
|
|
| apple ~ bake | F 0.666666666666667 |
876
|
|
|
|
|
|
|
| apple ~ bake | F_df [2 6] |
877
|
|
|
|
|
|
|
| apple ~ bake | F_p 0.54770848985725 |
878
|
|
|
|
|
|
|
| apple ~ bake | ms 0.333333333333334 |
879
|
|
|
|
|
|
|
| apple ~ bake | ss 0.666666666666667 |
880
|
|
|
|
|
|
|
| bake | F 32.6666666666667 |
881
|
|
|
|
|
|
|
| bake | F_df [1 6] |
882
|
|
|
|
|
|
|
| bake | F_p 0.00124263849516693 |
883
|
|
|
|
|
|
|
| bake | ms 16.3333333333333 |
884
|
|
|
|
|
|
|
| bake | ss 16.3333333333333 |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
=cut |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
*anova = \&PDLA::anova; |
889
|
|
|
|
|
|
|
sub PDLA::anova { |
890
|
4
|
50
|
|
4
|
0
|
4654
|
my $opt = pop @_ |
891
|
|
|
|
|
|
|
if ref $_[-1] eq 'HASH'; |
892
|
4
|
|
|
|
|
11
|
my ($y, @ivs_raw) = @_; |
893
|
|
|
|
|
|
|
croak "Mismatched number of elements in DV and IV. Are you passing IVs the old-and-abandoned way?" |
894
|
4
|
50
|
66
|
|
|
14
|
if (ref $ivs_raw[0] eq 'ARRAY') and (@{ $ivs_raw[0] } != $y->nelem); |
|
3
|
|
|
|
|
15
|
|
895
|
|
|
|
|
|
|
|
896
|
4
|
|
|
|
|
9
|
for (@ivs_raw) { |
897
|
10
|
50
|
66
|
|
|
144
|
croak "too many dims in IV!" |
898
|
|
|
|
|
|
|
if ref $_ eq 'PDLA' and $_->squeeze->ndims > 1; |
899
|
|
|
|
|
|
|
} |
900
|
|
|
|
|
|
|
|
901
|
4
|
|
|
|
|
145
|
my %opt = ( |
902
|
|
|
|
|
|
|
IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] |
903
|
|
|
|
|
|
|
PLOT => 1, # plots highest order effect |
904
|
|
|
|
|
|
|
V => 1, # carps if bad value |
905
|
|
|
|
|
|
|
); |
906
|
4
|
|
33
|
|
|
34
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
907
|
1
|
|
|
|
|
4
|
$opt{IVNM} = [ map { "IV_$_" } (0 .. $#ivs_raw) ] |
908
|
4
|
100
|
66
|
|
|
14
|
if !$opt{IVNM} or !@{$opt{IVNM}}; |
|
4
|
|
|
|
|
16
|
|
909
|
4
|
|
|
|
|
6
|
my @idv = @{ $opt{IVNM} }; |
|
4
|
|
|
|
|
9
|
|
910
|
|
|
|
|
|
|
|
911
|
4
|
|
|
|
|
7
|
my %ret; |
912
|
|
|
|
|
|
|
|
913
|
4
|
|
|
|
|
10
|
$y = $y->squeeze; |
914
|
|
|
|
|
|
|
# create new vars here so we don't mess up original caller @ |
915
|
|
|
|
|
|
|
my @pdl_ivs_raw = map { |
916
|
4
|
100
|
|
|
|
121
|
my $var = (ref $_ eq 'PDLA')? [list $_] : $_; |
|
10
|
|
|
|
|
34
|
|
917
|
10
|
|
|
|
|
183
|
scalar PDLA::Stats::Basic::_array_to_pdl $var; |
918
|
|
|
|
|
|
|
} @ivs_raw; |
919
|
|
|
|
|
|
|
|
920
|
4
|
|
|
|
|
12
|
my $pdl_ivs_raw = pdl \@pdl_ivs_raw; |
921
|
|
|
|
|
|
|
# explicit set badflag if any iv had bad value because pdl() removes badflag |
922
|
4
|
|
|
|
|
92
|
$pdl_ivs_raw->badflag( scalar grep { $_->badflag } @pdl_ivs_raw ); |
|
10
|
|
|
|
|
26
|
|
923
|
|
|
|
|
|
|
|
924
|
4
|
|
|
|
|
13
|
($y, $pdl_ivs_raw) = _rm_bad_value( $y, $pdl_ivs_raw ); |
925
|
|
|
|
|
|
|
|
926
|
4
|
100
|
100
|
|
|
34
|
if ($opt{V} and $y->nelem < $pdl_ivs_raw[0]->nelem) { |
927
|
1
|
|
|
|
|
41
|
printf STDERR "%d subjects with missing data removed\n", $pdl_ivs_raw[0]->nelem - $y->nelem; |
928
|
|
|
|
|
|
|
} |
929
|
|
|
|
|
|
|
|
930
|
|
|
|
|
|
|
# dog preserves data flow |
931
|
4
|
|
|
|
|
19
|
@pdl_ivs_raw = map {$_->copy} $pdl_ivs_raw->dog; |
|
10
|
|
|
|
|
340
|
|
932
|
|
|
|
|
|
|
|
933
|
4
|
|
|
|
|
162
|
my ($ivs_ref, $i_cmo_ref) |
934
|
|
|
|
|
|
|
= _effect_code_ivs( \@pdl_ivs_raw ); |
935
|
|
|
|
|
|
|
|
936
|
4
|
|
|
|
|
13
|
($ivs_ref, $i_cmo_ref, my( $idv, $ivs_cm_ref )) |
937
|
|
|
|
|
|
|
= _add_interactions( $ivs_ref, $i_cmo_ref, \@idv, \@pdl_ivs_raw ); |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
# add const here |
940
|
4
|
|
|
|
|
17
|
my $ivs = PDLA->null->glue( 1, @$ivs_ref ); |
941
|
4
|
|
|
|
|
722
|
$ivs = $ivs->glue(1, ones $ivs->dim(0)); |
942
|
|
|
|
|
|
|
|
943
|
4
|
|
|
|
|
525
|
my $b_full = $y->ols_t( $ivs, {CONST=>0} ); |
944
|
|
|
|
|
|
|
|
945
|
4
|
|
|
|
|
75
|
$ret{ss_total} = $y->ss; |
946
|
4
|
|
|
|
|
152
|
$ret{ss_residual} = $y->sse( sumover( $b_full * $ivs->xchg(0,1) ) ); |
947
|
4
|
|
|
|
|
49
|
$ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; |
948
|
4
|
|
|
|
|
31
|
$ret{F_df} = pdl($ivs->dim(1) - 1, $y->nelem - ($ivs->dim(1) - 1) -1); |
949
|
4
|
|
|
|
|
94
|
$ret{ms_model} = $ret{ss_model} / $ret{F_df}->(0); |
950
|
4
|
|
|
|
|
85
|
$ret{ms_residual} = $ret{ss_residual} / $ret{F_df}->(1); |
951
|
4
|
|
|
|
|
79
|
$ret{F} = $ret{ms_model} / $ret{ms_residual}; |
952
|
|
|
|
|
|
|
$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) |
953
|
4
|
50
|
|
|
|
16
|
if $CDF; |
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
# get IV ss from $ivs_ref instead of $ivs pdl |
956
|
|
|
|
|
|
|
|
957
|
4
|
|
|
|
|
11
|
for my $k (0 .. $#$ivs_ref) { |
958
|
22
|
|
|
|
|
45
|
my (@G, $G, $b_G); |
959
|
22
|
|
|
|
|
58
|
@G = grep { $_ != $k } (0 .. $#$ivs_ref); |
|
148
|
|
|
|
|
220
|
|
960
|
|
|
|
|
|
|
|
961
|
22
|
100
|
|
|
|
66
|
if (@G) { |
962
|
21
|
|
|
|
|
57
|
$G = PDLA->null->glue( 1, @$ivs_ref[@G] ); |
963
|
21
|
|
|
|
|
3020
|
$G = $G->glue(1, ones $G->dim(0)); |
964
|
|
|
|
|
|
|
} |
965
|
|
|
|
|
|
|
else { |
966
|
1
|
|
|
|
|
5
|
$G = ones( $y->dim(0) ); |
967
|
|
|
|
|
|
|
} |
968
|
22
|
|
|
|
|
2817
|
$b_G = $y->ols_t( $G, {CONST=>0} ); |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
$ret{ "| $idv->[$k] | ss" } |
971
|
22
|
|
|
|
|
170
|
= $y->sse( sumover($b_G * $G->transpose) ) - $ret{ss_residual}; |
972
|
|
|
|
|
|
|
$ret{ "| $idv->[$k] | F_df" } |
973
|
22
|
|
|
|
|
1295
|
= pdl( $ivs_ref->[$k]->dim(1), $ret{F_df}->(1)->copy )->squeeze; |
974
|
|
|
|
|
|
|
$ret{ "| $idv->[$k] | ms" } |
975
|
22
|
|
|
|
|
2138
|
= $ret{ "| $idv->[$k] | ss" } / $ret{ "| $idv->[$k] | F_df" }->(0); |
976
|
|
|
|
|
|
|
$ret{ "| $idv->[$k] | F" } |
977
|
22
|
|
|
|
|
570
|
= $ret{ "| $idv->[$k] | ms" } / $ret{ms_residual}; |
978
|
|
|
|
|
|
|
$ret{ "| $idv->[$k] | F_p" } |
979
|
22
|
50
|
|
|
|
206
|
= 1 - $ret{ "| $idv->[$k] | F" }->gsl_cdf_fdist_P( $ret{ "| $idv->[$k] | F_df" }->dog ) |
980
|
|
|
|
|
|
|
if $CDF; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
|
983
|
4
|
|
|
|
|
29
|
for (keys %ret) { $ret{$_} = $ret{$_}->squeeze }; |
|
116
|
|
|
|
|
3082
|
|
984
|
|
|
|
|
|
|
|
985
|
4
|
|
|
|
|
130
|
my $cm_ref = _cell_means( $y, $ivs_cm_ref, $i_cmo_ref, $idv, \@pdl_ivs_raw ); |
986
|
|
|
|
|
|
|
# sort bc we can't count on perl % internal key order implementation |
987
|
4
|
|
|
|
|
54
|
@ret{ sort keys %$cm_ref } = @$cm_ref{ sort keys %$cm_ref }; |
988
|
|
|
|
|
|
|
|
989
|
4
|
|
|
|
|
10
|
my $highest = join(' ~ ', @{ $opt{IVNM} }); |
|
4
|
|
|
|
|
19
|
|
990
|
|
|
|
|
|
|
$cm_ref->{"# $highest # m"}->plot_means( $cm_ref->{"# $highest # se"}, {%opt, IVNM=>$idv} ) |
991
|
4
|
50
|
|
|
|
13
|
if $opt{PLOT}; |
992
|
|
|
|
|
|
|
|
993
|
4
|
|
|
|
|
228
|
return %ret; |
994
|
|
|
|
|
|
|
} |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
sub _old_interface_check { |
997
|
0
|
|
|
0
|
|
0
|
my ($n, $ivs_ref) = @_; |
998
|
|
|
|
|
|
|
return 1 |
999
|
0
|
0
|
0
|
|
|
0
|
if (ref $ivs_ref->[0][0] eq 'ARRAY') and (@{ $ivs_ref->[0][0] } != $n); |
|
0
|
|
|
|
|
0
|
|
1000
|
|
|
|
|
|
|
} |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
sub _effect_code_ivs { |
1003
|
12
|
|
|
12
|
|
30
|
my $ivs = shift; |
1004
|
|
|
|
|
|
|
|
1005
|
12
|
|
|
|
|
24
|
my (@i_iv, @i_cmo); |
1006
|
12
|
|
|
|
|
32
|
for (@$ivs) { |
1007
|
28
|
|
|
|
|
379
|
my ($e, $map) = effect_code($_->squeeze); |
1008
|
28
|
50
|
|
|
|
128
|
my $var = ($e->getndims == 1)? $e->dummy(1) : $e; |
1009
|
28
|
|
|
|
|
52
|
push @i_iv, $var; |
1010
|
28
|
|
|
|
|
133
|
my @indices = sort { $a<=>$b } values %$map; |
|
59
|
|
|
|
|
112
|
|
1011
|
28
|
|
|
|
|
64
|
push @i_cmo, pdl @indices; |
1012
|
|
|
|
|
|
|
} |
1013
|
12
|
|
|
|
|
289
|
return \@i_iv, \@i_cmo; |
1014
|
|
|
|
|
|
|
} |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
sub _add_interactions { |
1017
|
12
|
|
|
12
|
|
28
|
my ($var_ref, $i_cmo_ref, $idv, $raw_ref) = @_; |
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
# append info re inter to main effects |
1020
|
12
|
|
|
|
|
18
|
my (@inter, @idv_inter, @inter_cm, @inter_cmo); |
1021
|
12
|
|
|
|
|
44
|
for my $nway ( 2 .. @$var_ref ) { |
1022
|
16
|
|
|
|
|
64
|
my $iter_idv = _combinations( $nway, [0..$#$var_ref] ); |
1023
|
|
|
|
|
|
|
|
1024
|
16
|
|
|
|
|
38
|
while ( my @v = &$iter_idv() ) { |
1025
|
32
|
|
|
|
|
155
|
my $i = ones( $var_ref->[0]->dim(0), 1 ); |
1026
|
32
|
|
|
|
|
1674
|
for (@v) { |
1027
|
74
|
|
|
|
|
2023
|
$i = $i * $var_ref->[$_]->dummy(1); |
1028
|
74
|
|
|
|
|
3358
|
$i = $i->clump(1,2); |
1029
|
|
|
|
|
|
|
} |
1030
|
32
|
|
|
|
|
1265
|
push @inter, $i; |
1031
|
|
|
|
|
|
|
|
1032
|
32
|
|
|
|
|
82
|
my $e = join( ' ~ ', @$idv[@v] ); |
1033
|
32
|
|
|
|
|
50
|
push @idv_inter, $e; |
1034
|
|
|
|
|
|
|
|
1035
|
|
|
|
|
|
|
# now prepare for cell mean |
1036
|
32
|
|
|
|
|
46
|
my @i_cm = (); |
1037
|
32
|
|
|
|
|
129
|
for my $o ( 0 .. $raw_ref->[0]->dim(0) - 1 ) { |
1038
|
1540
|
|
|
|
|
81160
|
my @cell = map { $_($o)->squeeze } @$raw_ref[@v]; |
|
3594
|
|
|
|
|
82820
|
|
1039
|
1540
|
|
|
|
|
57942
|
push @i_cm, join('', @cell); |
1040
|
|
|
|
|
|
|
} |
1041
|
32
|
|
|
|
|
1776
|
my ($inter, $map) = effect_code( \@i_cm ); |
1042
|
32
|
|
|
|
|
66
|
push @inter_cm, $inter; |
1043
|
|
|
|
|
|
|
|
1044
|
|
|
|
|
|
|
# get the order to put means in correct multi dim pdl pos |
1045
|
|
|
|
|
|
|
# this is order in var_e dim(1) |
1046
|
32
|
|
|
|
|
180
|
my @levels = sort { $map->{$a} <=> $map->{$b} } keys %$map; |
|
658
|
|
|
|
|
805
|
|
1047
|
|
|
|
|
|
|
# this is order needed for cell mean |
1048
|
32
|
|
|
|
|
91
|
my @i_cmo = sort { reverse($levels[$a]) cmp reverse($levels[$b]) } |
|
699
|
|
|
|
|
962
|
|
1049
|
|
|
|
|
|
|
0 .. $#levels; |
1050
|
32
|
|
|
|
|
93
|
push @inter_cmo, pdl @i_cmo; |
1051
|
|
|
|
|
|
|
} |
1052
|
|
|
|
|
|
|
} |
1053
|
|
|
|
|
|
|
# append info re inter to main effects |
1054
|
12
|
|
|
|
|
88
|
return ([@$var_ref, @inter], [@$i_cmo_ref, @inter_cmo], |
1055
|
|
|
|
|
|
|
[@$idv, @idv_inter], [@$var_ref, @inter_cm] ); |
1056
|
|
|
|
|
|
|
} |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
sub _cell_means { |
1059
|
12
|
|
|
12
|
|
36
|
my ($data, $ivs_ref, $i_cmo_ref, $ids, $raw_ref) = @_; |
1060
|
|
|
|
|
|
|
|
1061
|
12
|
|
|
|
|
17
|
my %ind_id; |
1062
|
12
|
|
|
|
|
87
|
@ind_id{ @$ids } = 0..$#$ids; |
1063
|
|
|
|
|
|
|
|
1064
|
12
|
|
|
|
|
23
|
my %cm; |
1065
|
12
|
|
|
|
|
23
|
my $i = 0; |
1066
|
12
|
|
|
|
|
35
|
for (@$ivs_ref) { |
1067
|
60
|
|
|
|
|
206
|
my $last = zeroes $_->dim(0); |
1068
|
60
|
|
|
|
|
2847
|
my $i_neg = which $_( ,0) == -1; |
1069
|
60
|
|
|
|
|
3517
|
($_tmp = $last($i_neg)) .= 1; |
1070
|
60
|
|
|
|
|
3605
|
($_tmp = $_->where($_ == -1)) .= 0; |
1071
|
60
|
|
|
|
|
4757
|
$_ = $_->glue(1, $last); |
1072
|
|
|
|
|
|
|
|
1073
|
60
|
|
|
|
|
5106
|
my @v = split ' ~ ', $ids->[$i]; |
1074
|
60
|
|
|
|
|
162
|
my @shape = map { $raw_ref->[$_]->uniq->nelem } @ind_id{@v}; |
|
102
|
|
|
|
|
7661
|
|
1075
|
|
|
|
|
|
|
|
1076
|
60
|
|
|
|
|
11272
|
my ($m, $ss) = $data->centroid( $_ ); |
1077
|
60
|
|
|
|
|
238
|
$m = $m($i_cmo_ref->[$i])->sever; |
1078
|
60
|
|
|
|
|
2547
|
$ss = $ss($i_cmo_ref->[$i])->sever; |
1079
|
60
|
|
|
|
|
2117
|
$m = $m->reshape(@shape); |
1080
|
60
|
100
|
|
|
|
2130
|
$m->getndims == 1 and $m = $m->dummy(1); |
1081
|
60
|
|
|
|
|
2217
|
my $se = sqrt( ($ss/($_->sumover - 1)) / $_->sumover )->reshape(@shape); |
1082
|
60
|
100
|
|
|
|
2731
|
$se->getndims == 1 and $se = $se->dummy(1); |
1083
|
60
|
|
|
|
|
908
|
$cm{ "# $ids->[$i] # m" } = $m; |
1084
|
60
|
|
|
|
|
142
|
$cm{ "# $ids->[$i] # se" } = $se; |
1085
|
60
|
|
|
|
|
280
|
$i++; |
1086
|
|
|
|
|
|
|
} |
1087
|
12
|
|
|
|
|
56
|
return \%cm; |
1088
|
|
|
|
|
|
|
} |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
# http://www.perlmonks.org/?node_id=371228 |
1091
|
|
|
|
|
|
|
sub _combinations { |
1092
|
16
|
|
|
16
|
|
35
|
my ($num, $arr) = @_; |
1093
|
|
|
|
|
|
|
|
1094
|
0
|
|
|
0
|
|
0
|
return sub { return } |
1095
|
16
|
50
|
33
|
|
|
77
|
if $num == 0 or $num > @$arr; |
1096
|
|
|
|
|
|
|
|
1097
|
16
|
|
|
|
|
20
|
my @pick; |
1098
|
|
|
|
|
|
|
|
1099
|
|
|
|
|
|
|
return sub { |
1100
|
48
|
100
|
|
48
|
|
924
|
return @$arr[ @pick = ( 0 .. $num - 1 ) ] |
1101
|
|
|
|
|
|
|
unless @pick; |
1102
|
|
|
|
|
|
|
|
1103
|
32
|
|
|
|
|
55
|
my $i = $#pick; |
1104
|
32
|
|
100
|
|
|
213
|
$i-- until $i < 0 or $pick[$i]++ < @$arr - $num + $i; |
1105
|
32
|
100
|
|
|
|
177
|
return if $i < 0; |
1106
|
|
|
|
|
|
|
|
1107
|
16
|
|
|
|
|
42
|
@pick[$i .. $#pick] = $pick[$i] .. $#$arr; |
1108
|
|
|
|
|
|
|
|
1109
|
16
|
|
|
|
|
43
|
return @$arr[@pick]; |
1110
|
16
|
|
|
|
|
97
|
}; |
1111
|
|
|
|
|
|
|
} |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=head2 anova_rptd |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
Repeated measures and mixed model anova. Uses type III sum of squares. The standard error (se) for the means are based on the relevant mean squared error from the anova, ie it is pooled across levels of the effect. |
1116
|
|
|
|
|
|
|
|
1117
|
|
|
|
|
|
|
anova_rptd supports bad value in the dependent and independent variables. It automatically removes bad data listwise, ie remove a subject's data if there is any cell missing for the subject. |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
Default options (case insensitive): |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
V => 1, # carps if bad value in dv |
1122
|
|
|
|
|
|
|
IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] |
1123
|
|
|
|
|
|
|
BTWN => [], # indices of between-subject IVs (matches IVNM indices) |
1124
|
|
|
|
|
|
|
PLOT => 1, # plots highest order effect |
1125
|
|
|
|
|
|
|
# see plot_means() for more options |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
Usage: |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
Some fictional data: recall_w_beer_and_wings.txt |
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
Subject Beer Wings Recall |
1132
|
|
|
|
|
|
|
Alex 1 1 8 |
1133
|
|
|
|
|
|
|
Alex 1 2 9 |
1134
|
|
|
|
|
|
|
Alex 1 3 12 |
1135
|
|
|
|
|
|
|
Alex 2 1 7 |
1136
|
|
|
|
|
|
|
Alex 2 2 9 |
1137
|
|
|
|
|
|
|
Alex 2 3 12 |
1138
|
|
|
|
|
|
|
Brian 1 1 12 |
1139
|
|
|
|
|
|
|
Brian 1 2 13 |
1140
|
|
|
|
|
|
|
Brian 1 3 14 |
1141
|
|
|
|
|
|
|
Brian 2 1 9 |
1142
|
|
|
|
|
|
|
Brian 2 2 8 |
1143
|
|
|
|
|
|
|
Brian 2 3 14 |
1144
|
|
|
|
|
|
|
... |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
# rtable allows text only in 1st row and col |
1147
|
|
|
|
|
|
|
my ($data, $idv, $subj) = rtable 'recall_w_beer_and_wings.txt'; |
1148
|
|
|
|
|
|
|
|
1149
|
|
|
|
|
|
|
my ($b, $w, $dv) = $data->dog; |
1150
|
|
|
|
|
|
|
# subj and IVs can be 1d pdl or @ ref |
1151
|
|
|
|
|
|
|
# subj must be the first argument |
1152
|
|
|
|
|
|
|
my %m = $dv->anova_rptd( $subj, $b, $w, {ivnm=>['Beer', 'Wings']} ); |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
print "$_\t$m{$_}\n" for (sort keys %m); |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
# Beer # m |
1157
|
|
|
|
|
|
|
[ |
1158
|
|
|
|
|
|
|
[ 10.916667 8.9166667] |
1159
|
|
|
|
|
|
|
] |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
# Beer # se |
1162
|
|
|
|
|
|
|
[ |
1163
|
|
|
|
|
|
|
[ 0.4614791 0.4614791] |
1164
|
|
|
|
|
|
|
] |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
# Beer ~ Wings # m |
1167
|
|
|
|
|
|
|
[ |
1168
|
|
|
|
|
|
|
[ 10 7] |
1169
|
|
|
|
|
|
|
[ 10.5 9.25] |
1170
|
|
|
|
|
|
|
[12.25 10.5] |
1171
|
|
|
|
|
|
|
] |
1172
|
|
|
|
|
|
|
|
1173
|
|
|
|
|
|
|
# Beer ~ Wings # se |
1174
|
|
|
|
|
|
|
[ |
1175
|
|
|
|
|
|
|
[0.89170561 0.89170561] |
1176
|
|
|
|
|
|
|
[0.89170561 0.89170561] |
1177
|
|
|
|
|
|
|
[0.89170561 0.89170561] |
1178
|
|
|
|
|
|
|
] |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
# Wings # m |
1181
|
|
|
|
|
|
|
[ |
1182
|
|
|
|
|
|
|
[ 8.5 9.875 11.375] |
1183
|
|
|
|
|
|
|
] |
1184
|
|
|
|
|
|
|
|
1185
|
|
|
|
|
|
|
# Wings # se |
1186
|
|
|
|
|
|
|
[ |
1187
|
|
|
|
|
|
|
[0.67571978 0.67571978 0.67571978] |
1188
|
|
|
|
|
|
|
] |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
ss_residual 19.0833333333333 |
1191
|
|
|
|
|
|
|
ss_subject 24.8333333333333 |
1192
|
|
|
|
|
|
|
ss_total 133.833333333333 |
1193
|
|
|
|
|
|
|
| Beer | F 9.39130434782609 |
1194
|
|
|
|
|
|
|
| Beer | F_p 0.0547977008378944 |
1195
|
|
|
|
|
|
|
| Beer | df 1 |
1196
|
|
|
|
|
|
|
| Beer | ms 24 |
1197
|
|
|
|
|
|
|
| Beer | ss 24 |
1198
|
|
|
|
|
|
|
| Beer || err df 3 |
1199
|
|
|
|
|
|
|
| Beer || err ms 2.55555555555556 |
1200
|
|
|
|
|
|
|
| Beer || err ss 7.66666666666667 |
1201
|
|
|
|
|
|
|
| Beer ~ Wings | F 0.510917030567687 |
1202
|
|
|
|
|
|
|
| Beer ~ Wings | F_p 0.623881438624431 |
1203
|
|
|
|
|
|
|
| Beer ~ Wings | df 2 |
1204
|
|
|
|
|
|
|
| Beer ~ Wings | ms 1.625 |
1205
|
|
|
|
|
|
|
| Beer ~ Wings | ss 3.25000000000001 |
1206
|
|
|
|
|
|
|
| Beer ~ Wings || err df 6 |
1207
|
|
|
|
|
|
|
| Beer ~ Wings || err ms 3.18055555555555 |
1208
|
|
|
|
|
|
|
| Beer ~ Wings || err ss 19.0833333333333 |
1209
|
|
|
|
|
|
|
| Wings | F 4.52851711026616 |
1210
|
|
|
|
|
|
|
| Wings | F_p 0.0632754786153548 |
1211
|
|
|
|
|
|
|
| Wings | df 2 |
1212
|
|
|
|
|
|
|
| Wings | ms 16.5416666666667 |
1213
|
|
|
|
|
|
|
| Wings | ss 33.0833333333333 |
1214
|
|
|
|
|
|
|
| Wings || err df 6 |
1215
|
|
|
|
|
|
|
| Wings || err ms 3.65277777777778 |
1216
|
|
|
|
|
|
|
| Wings || err ss 21.9166666666667 |
1217
|
|
|
|
|
|
|
|
1218
|
|
|
|
|
|
|
For mixed model anova, ie when there are between-subject IVs involved, feed the IVs as above, but specify in BTWN which IVs are between-subject. For example, if we had added age as a between-subject IV in the above example, we would do |
1219
|
|
|
|
|
|
|
|
1220
|
|
|
|
|
|
|
my %m = $dv->anova_rptd( $subj, $age, $b, $w, |
1221
|
|
|
|
|
|
|
{ ivnm=>['Age', 'Beer', 'Wings'], btwn=>[0] }); |
1222
|
|
|
|
|
|
|
|
1223
|
|
|
|
|
|
|
=cut |
1224
|
|
|
|
|
|
|
|
1225
|
|
|
|
|
|
|
*anova_rptd = \&PDLA::anova_rptd; |
1226
|
|
|
|
|
|
|
sub PDLA::anova_rptd { |
1227
|
8
|
50
|
|
8
|
0
|
12615
|
my $opt = pop @_ |
1228
|
|
|
|
|
|
|
if ref $_[-1] eq 'HASH'; |
1229
|
8
|
|
|
|
|
29
|
my ($y, $subj, @ivs_raw) = @_; |
1230
|
|
|
|
|
|
|
|
1231
|
8
|
|
|
|
|
23
|
for (@ivs_raw) { |
1232
|
18
|
50
|
33
|
|
|
443
|
croak "too many dims in IV!" |
1233
|
|
|
|
|
|
|
if ref $_ eq 'PDLA' and $_->squeeze->ndims > 1 |
1234
|
|
|
|
|
|
|
} |
1235
|
|
|
|
|
|
|
|
1236
|
8
|
|
|
|
|
291
|
my %opt = ( |
1237
|
|
|
|
|
|
|
V => 1, # carps if bad value in dv |
1238
|
|
|
|
|
|
|
IVNM => [], # auto filled as ['IV_0', 'IV_1', ... ] |
1239
|
|
|
|
|
|
|
BTWN => [], # indices of between-subject IVs (matches IVNM indices) |
1240
|
|
|
|
|
|
|
PLOT => 1, # plots highest order effect |
1241
|
|
|
|
|
|
|
); |
1242
|
8
|
|
33
|
|
|
76
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
1243
|
1
|
|
|
|
|
5
|
$opt{IVNM} = [ map { "IV_$_" } 0 .. $#ivs_raw ] |
1244
|
8
|
100
|
66
|
|
|
29
|
if !$opt{IVNM} or !@{ $opt{IVNM} }; |
|
8
|
|
|
|
|
35
|
|
1245
|
8
|
|
|
|
|
14
|
my @idv = @{ $opt{IVNM} }; |
|
8
|
|
|
|
|
22
|
|
1246
|
|
|
|
|
|
|
|
1247
|
8
|
|
|
|
|
13
|
my %ret; |
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
# create new vars here so we don't mess up original caller @ |
1250
|
|
|
|
|
|
|
my ($sj, @pdl_ivs_raw) |
1251
|
8
|
100
|
|
|
|
21
|
= map { my $var = (ref $_ eq 'PDLA')? [list $_] : $_; |
|
26
|
|
|
|
|
90
|
|
1252
|
26
|
|
|
|
|
580
|
scalar PDLA::Stats::Basic::_array_to_pdl $var; |
1253
|
|
|
|
|
|
|
} ( $subj, @ivs_raw ); |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
# delete bad data listwise ie remove subj if any cell missing |
1256
|
8
|
|
|
|
|
35
|
$y = $y->squeeze; |
1257
|
8
|
|
|
|
|
289
|
my $pdl_ivs_raw = pdl \@pdl_ivs_raw; |
1258
|
|
|
|
|
|
|
# explicit set badflag because pdl() removes badflag |
1259
|
8
|
|
|
|
|
163
|
$pdl_ivs_raw->badflag( scalar grep { $_->badflag } @pdl_ivs_raw ); |
|
18
|
|
|
|
|
49
|
|
1260
|
|
|
|
|
|
|
|
1261
|
8
|
|
|
|
|
103
|
my $ibad = which( $y->isbad | nbadover($pdl_ivs_raw->transpose) ); |
1262
|
|
|
|
|
|
|
|
1263
|
8
|
|
|
|
|
455
|
my $sj_bad = $sj($ibad)->uniq; |
1264
|
8
|
100
|
|
|
|
844
|
if ($sj_bad->nelem) { |
1265
|
|
|
|
|
|
|
print STDERR $sj_bad->nelem . " subjects with missing data removed\n" |
1266
|
3
|
50
|
|
|
|
11
|
if $opt{V}; |
1267
|
|
|
|
|
|
|
$sj = $sj->setvaltobad($_) |
1268
|
3
|
|
|
|
|
10
|
for (list $sj_bad); |
1269
|
3
|
|
|
|
|
101
|
my $igood = which $sj->isgood; |
1270
|
3
|
|
|
|
|
84
|
for ($y, $sj, @pdl_ivs_raw) { |
1271
|
12
|
|
|
|
|
29
|
$_ = $_( $igood )->sever; |
1272
|
12
|
|
|
|
|
444
|
$_->badflag(0); |
1273
|
|
|
|
|
|
|
} |
1274
|
|
|
|
|
|
|
} |
1275
|
|
|
|
|
|
|
# code for ivs and cell mean in diff @s: effect_code vs iv_cluster |
1276
|
8
|
|
|
|
|
29
|
my ($ivs_ref, $i_cmo_ref) |
1277
|
|
|
|
|
|
|
= _effect_code_ivs( \@pdl_ivs_raw ); |
1278
|
|
|
|
|
|
|
|
1279
|
8
|
|
|
|
|
33
|
($ivs_ref, $i_cmo_ref, my( $idv, $ivs_cm_ref)) |
1280
|
|
|
|
|
|
|
= _add_interactions( $ivs_ref, $i_cmo_ref, \@idv, \@pdl_ivs_raw ); |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
# matches $ivs_ref, with an extra last pdl for subj effect |
1283
|
8
|
|
|
|
|
46
|
my $err_ref |
1284
|
|
|
|
|
|
|
= _add_errors( $sj, $ivs_ref, $idv, \@pdl_ivs_raw, \%opt ); |
1285
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
# stitch together |
1287
|
8
|
|
|
|
|
31
|
my $ivs = PDLA->null->glue( 1, @$ivs_ref ); |
1288
|
8
|
100
|
|
|
|
1113
|
$ivs = $ivs->glue(1, grep { defined($_) and ref($_) } @$err_ref); |
|
46
|
|
|
|
|
131
|
|
1289
|
8
|
|
|
|
|
834
|
$ivs = $ivs->glue(1, ones $ivs->dim(0)); |
1290
|
8
|
|
|
|
|
963
|
my $b_full = $y->ols_t( $ivs, {CONST=>0} ); |
1291
|
|
|
|
|
|
|
|
1292
|
8
|
|
|
|
|
153
|
$ret{ss_total} = $y->ss; |
1293
|
8
|
|
|
|
|
260
|
$ret{ss_residual} = $y->sse( sumover( $b_full * $ivs->xchg(0,1) ) ); |
1294
|
|
|
|
|
|
|
|
1295
|
8
|
|
|
|
|
71
|
my @full = (@$ivs_ref, @$err_ref); |
1296
|
8
|
|
|
|
|
28
|
EFFECT: for my $k (0 .. $#full) { |
1297
|
84
|
100
|
|
|
|
304
|
my $e = ($k > $#$ivs_ref)? '| err' : ''; |
1298
|
84
|
100
|
|
|
|
225
|
my $i = ($k > $#$ivs_ref)? $k - @$ivs_ref : $k; |
1299
|
|
|
|
|
|
|
|
1300
|
84
|
100
|
|
|
|
315
|
if (!defined $full[$k]) { # ss_residual as error |
|
|
100
|
|
|
|
|
|
1301
|
14
|
|
|
|
|
55
|
$ret{ "| $idv->[$i] |$e ss" } = $ret{ss_residual}; |
1302
|
|
|
|
|
|
|
# highest ord inter for purely within design, (p-1)*(q-1)*(n-1) |
1303
|
|
|
|
|
|
|
$ret{ "| $idv->[$i] |$e df" } |
1304
|
14
|
|
|
|
|
42
|
= pdl(map { $_->dim(1) } @full[0 .. $#ivs_raw])->prodover; |
|
36
|
|
|
|
|
96
|
|
1305
|
14
|
100
|
|
|
|
475
|
$ret{ "| $idv->[$i] |$e df" } |
1306
|
|
|
|
|
|
|
*= ref($full[-1])? $full[-1]->dim(1) |
1307
|
|
|
|
|
|
|
: $err_ref->[$err_ref->[-1]]->dim(1) |
1308
|
|
|
|
|
|
|
; |
1309
|
|
|
|
|
|
|
$ret{ "| $idv->[$i] |$e ms" } |
1310
|
14
|
|
|
|
|
294
|
= $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; |
1311
|
|
|
|
|
|
|
} |
1312
|
|
|
|
|
|
|
elsif (ref $full[$k]) { # unique error term |
1313
|
58
|
|
|
|
|
138
|
my (@G, $G, $b_G); |
1314
|
58
|
100
|
|
|
|
186
|
@G = grep { $_ != $k and defined $full[$_] } (0 .. $#full); |
|
942
|
|
|
|
|
2231
|
|
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
next EFFECT |
1317
|
58
|
50
|
|
|
|
183
|
unless @G; |
1318
|
|
|
|
|
|
|
|
1319
|
58
|
|
|
|
|
175
|
$G = PDLA->null->glue( 1, grep { ref $_ } @full[@G] ); |
|
760
|
|
|
|
|
1692
|
|
1320
|
58
|
|
|
|
|
11469
|
$G = $G->glue(1, ones $G->dim(0)); |
1321
|
58
|
|
|
|
|
7626
|
$b_G = $y->ols_t( $G, {CONST=>0} ); |
1322
|
|
|
|
|
|
|
|
1323
|
58
|
100
|
|
|
|
300
|
if ($k == $#full) { |
1324
|
|
|
|
|
|
|
$ret{ss_subject} |
1325
|
4
|
|
|
|
|
22
|
= $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; |
1326
|
|
|
|
|
|
|
} |
1327
|
|
|
|
|
|
|
else { |
1328
|
|
|
|
|
|
|
$ret{ "| $idv->[$i] |$e ss" } |
1329
|
54
|
|
|
|
|
339
|
= $y->sse(sumover($b_G * $G->transpose)) - $ret{ss_residual}; |
1330
|
54
|
|
|
|
|
3323
|
$ret{ "| $idv->[$i] |$e df" } |
1331
|
|
|
|
|
|
|
= $full[$k]->dim(1); |
1332
|
|
|
|
|
|
|
$ret{ "| $idv->[$i] |$e ms" } |
1333
|
54
|
|
|
|
|
1235
|
= $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; |
1334
|
|
|
|
|
|
|
} |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
else { # repeating error term |
1337
|
12
|
100
|
|
|
|
24
|
if ($k == $#full) { |
1338
|
4
|
|
|
|
|
21
|
$ret{ss_subject} = $ret{"| $idv->[$full[$k]] |$e ss"}; |
1339
|
|
|
|
|
|
|
} |
1340
|
|
|
|
|
|
|
else { |
1341
|
8
|
|
|
|
|
37
|
$ret{ "| $idv->[$i] |$e ss" } = $ret{"| $idv->[$full[$k]] |$e ss"}; |
1342
|
8
|
|
|
|
|
27
|
$ret{ "| $idv->[$i] |$e df" } = $ret{"| $idv->[$full[$k]] |$e df"}; |
1343
|
|
|
|
|
|
|
$ret{ "| $idv->[$i] |$e ms" } |
1344
|
8
|
|
|
|
|
98
|
= $ret{ "| $idv->[$i] |$e ss" } / $ret{ "| $idv->[$i] |$e df" }; |
1345
|
|
|
|
|
|
|
} |
1346
|
|
|
|
|
|
|
} |
1347
|
|
|
|
|
|
|
} |
1348
|
|
|
|
|
|
|
# have all iv, inter, and error effects. get F and F_p |
1349
|
8
|
|
|
|
|
219
|
for (0 .. $#$ivs_ref) { |
1350
|
|
|
|
|
|
|
$ret{ "| $idv->[$_] | F" } |
1351
|
38
|
|
|
|
|
304
|
= $ret{ "| $idv->[$_] | ms" } / $ret{ "| $idv->[$_] || err ms" }; |
1352
|
|
|
|
|
|
|
$ret{ "| $idv->[$_] | F_p" } |
1353
|
|
|
|
|
|
|
= 1 - $ret{ "| $idv->[$_] | F" }->gsl_cdf_fdist_P( |
1354
|
38
|
50
|
|
|
|
87
|
$ret{ "| $idv->[$_] | df" }, $ret{ "| $idv->[$_] || err df" } ) |
1355
|
|
|
|
|
|
|
if $CDF; |
1356
|
|
|
|
|
|
|
} |
1357
|
|
|
|
|
|
|
|
1358
|
8
|
100
|
|
|
|
62
|
for (keys %ret) {ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze}; |
|
290
|
|
|
|
|
5417
|
|
1359
|
|
|
|
|
|
|
|
1360
|
8
|
|
|
|
|
216
|
my $cm_ref |
1361
|
|
|
|
|
|
|
= _cell_means( $y, $ivs_cm_ref, $i_cmo_ref, $idv, \@pdl_ivs_raw ); |
1362
|
8
|
|
|
|
|
20
|
my @ls = map { $_->uniq->nelem } @pdl_ivs_raw; |
|
18
|
|
|
|
|
2001
|
|
1363
|
|
|
|
|
|
|
$cm_ref |
1364
|
8
|
|
|
|
|
1333
|
= _fix_rptd_se( $cm_ref, \%ret, $opt{'IVNM'}, \@ls, $sj->uniq->nelem ); |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
# integrate mean and se into %ret |
1367
|
|
|
|
|
|
|
# sort bc we can't count on perl % internal key order implementation |
1368
|
8
|
|
|
|
|
132
|
@ret{ sort keys %$cm_ref } = @$cm_ref{ sort keys %$cm_ref }; |
1369
|
|
|
|
|
|
|
|
1370
|
8
|
|
|
|
|
25
|
my $highest = join(' ~ ', @{ $opt{IVNM} }); |
|
8
|
|
|
|
|
37
|
|
1371
|
|
|
|
|
|
|
$cm_ref->{"# $highest # m"}->plot_means( $cm_ref->{"# $highest # se"}, |
1372
|
|
|
|
|
|
|
{ %opt, IVNM=>$idv } ) |
1373
|
8
|
50
|
|
|
|
30
|
if $opt{PLOT}; |
1374
|
|
|
|
|
|
|
|
1375
|
8
|
|
|
|
|
638
|
return %ret; |
1376
|
|
|
|
|
|
|
} |
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
sub _add_errors { |
1379
|
8
|
|
|
8
|
|
26
|
my ($subj, $ivs_ref, $idv, $raw_ivs, $opt) = @_; |
1380
|
|
|
|
|
|
|
|
1381
|
|
|
|
|
|
|
# code (btwn group) subjects. Rutherford (2001) pp 101-102 |
1382
|
|
|
|
|
|
|
|
1383
|
8
|
|
|
|
|
15
|
my (@grp, %grp_s); |
1384
|
8
|
|
|
|
|
42
|
for my $n (0 .. $subj->nelem - 1) { |
1385
|
|
|
|
|
|
|
# construct string to code group membership |
1386
|
|
|
|
|
|
|
# something not treated as BAD by _array_to_pdl to start off marking group membership |
1387
|
|
|
|
|
|
|
# if no $opt->{BTWN}, everyone ends up in the same grp |
1388
|
219
|
|
|
|
|
292
|
my $s = '_'; |
1389
|
|
|
|
|
|
|
$s .= $_->($n) |
1390
|
219
|
|
|
|
|
229
|
for (@$raw_ivs[@{ $opt->{BTWN} }]); |
|
219
|
|
|
|
|
466
|
|
1391
|
219
|
|
|
|
|
9171
|
push @grp, $s; # group membership |
1392
|
219
|
|
|
|
|
398
|
$s .= $subj($n); # keep track of total uniq subj |
1393
|
219
|
|
|
|
|
11723
|
$grp_s{$s} = 1; |
1394
|
|
|
|
|
|
|
} |
1395
|
8
|
|
|
|
|
42
|
my $grp = PDLA::Stats::Kmeans::iv_cluster \@grp; |
1396
|
|
|
|
|
|
|
|
1397
|
8
|
|
|
|
|
56
|
my $spdl = zeroes $subj->dim(0), keys(%grp_s) - $grp->dim(1); |
1398
|
8
|
|
|
|
|
413
|
my ($d0, $d1) = (0, 0); |
1399
|
8
|
|
|
|
|
40
|
for my $g (0 .. $grp->dim(1)-1) { |
1400
|
14
|
|
|
|
|
42
|
my $gsub = $subj( which $grp( ,$g) )->effect_code; |
1401
|
14
|
|
|
|
|
103
|
my ($nobs, $nsub) = $gsub->dims; |
1402
|
14
|
|
|
|
|
320
|
($_tmp = $spdl($d0:$d0+$nobs-1, $d1:$d1+$nsub-1)) .= $gsub; |
1403
|
14
|
|
|
|
|
437
|
$d0 += $nobs; |
1404
|
14
|
|
|
|
|
45
|
$d1 += $nsub; |
1405
|
|
|
|
|
|
|
} |
1406
|
|
|
|
|
|
|
|
1407
|
|
|
|
|
|
|
# if btwn factor involved, or highest order inter for within factors |
1408
|
|
|
|
|
|
|
# elem is undef, so that |
1409
|
|
|
|
|
|
|
# @errors ind matches @$ivs_ref, with an extra elem at the end for subj |
1410
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
# mark btwn factors for error terms |
1412
|
|
|
|
|
|
|
# same error term for B(wn) and A(btwn) x B(wn) (Rutherford, p98) |
1413
|
8
|
|
|
|
|
16
|
my @qr = map { "(?:$idv->[$_])" } @{ $opt->{BTWN} }; |
|
5
|
|
|
|
|
20
|
|
|
8
|
|
|
|
|
22
|
|
1414
|
8
|
|
|
|
|
29
|
my $qr = join('|', @qr); |
1415
|
|
|
|
|
|
|
|
1416
|
8
|
|
|
|
|
13
|
my $ie_subj; |
1417
|
|
|
|
|
|
|
my @errors = map |
1418
|
8
|
|
|
|
|
27
|
{ my @fs = split ' ~ ', $idv->[$_]; |
|
38
|
|
|
|
|
1460
|
|
1419
|
|
|
|
|
|
|
# separate bw and wn factors |
1420
|
|
|
|
|
|
|
# if only bw, error is bw x subj |
1421
|
|
|
|
|
|
|
# if only wn or wn and bw, error is wn x subj |
1422
|
38
|
|
|
|
|
60
|
my (@wn, @bw); |
1423
|
38
|
100
|
|
|
|
59
|
if ($qr) { |
1424
|
24
|
|
|
|
|
37
|
for (@fs) { |
1425
|
44
|
100
|
|
|
|
246
|
/$qr/? push @bw, $_ : push @wn, $_; |
1426
|
|
|
|
|
|
|
} |
1427
|
|
|
|
|
|
|
} |
1428
|
|
|
|
|
|
|
else { |
1429
|
14
|
|
|
|
|
30
|
@wn = @fs; |
1430
|
|
|
|
|
|
|
} |
1431
|
38
|
100
|
|
|
|
76
|
$ie_subj = defined($ie_subj)? $ie_subj : $_ |
|
|
100
|
|
|
|
|
|
1432
|
|
|
|
|
|
|
if !@wn; |
1433
|
|
|
|
|
|
|
|
1434
|
38
|
100
|
|
|
|
120
|
my $err = @wn? join(' ~ ', @wn) : join(' ~ ', @bw); |
1435
|
38
|
|
|
|
|
49
|
my $ie; # mark repeating error term |
1436
|
38
|
|
|
|
|
69
|
for my $i (0 .. $#$ivs_ref) { |
1437
|
129
|
100
|
|
|
|
205
|
if ($idv->[$i] eq $err) { |
1438
|
38
|
|
|
|
|
48
|
$ie = $i; |
1439
|
38
|
|
|
|
|
47
|
last; |
1440
|
|
|
|
|
|
|
} |
1441
|
|
|
|
|
|
|
} |
1442
|
|
|
|
|
|
|
|
1443
|
|
|
|
|
|
|
# highest order inter of within factors, use ss_residual as error |
1444
|
38
|
100
|
100
|
|
|
53
|
if ( @wn == @$raw_ivs - @{$opt->{BTWN}} ) { undef } |
|
38
|
100
|
|
|
|
124
|
|
|
14
|
100
|
|
|
|
41
|
|
1445
|
|
|
|
|
|
|
# repeating btwn factors use ss_subject as error |
1446
|
2
|
|
|
|
|
5
|
elsif (!@wn and $_ > $ie_subj) { $ie_subj } |
1447
|
|
|
|
|
|
|
# repeating error term |
1448
|
6
|
|
|
|
|
12
|
elsif ($_ > $ie) { $ie } |
1449
|
16
|
|
|
|
|
50
|
else { PDLA::clump($ivs_ref->[$_] * $spdl->dummy(1), 1,2) } |
1450
|
|
|
|
|
|
|
} 0 .. $#$ivs_ref; |
1451
|
|
|
|
|
|
|
|
1452
|
8
|
100
|
|
|
|
15
|
@{$opt->{BTWN}}? push @errors, $ie_subj : push @errors, $spdl; |
|
8
|
|
|
|
|
26
|
|
1453
|
|
|
|
|
|
|
|
1454
|
8
|
|
|
|
|
60
|
return \@errors; |
1455
|
|
|
|
|
|
|
} |
1456
|
|
|
|
|
|
|
|
1457
|
|
|
|
|
|
|
sub _fix_rptd_se { |
1458
|
|
|
|
|
|
|
# if ivnm lvls_ref for within ss only this can work for mixed design |
1459
|
8
|
|
|
8
|
|
1281
|
my ($cm_ref, $ret, $ivnm, $lvls_ref, $n) = @_; |
1460
|
|
|
|
|
|
|
|
1461
|
8
|
|
|
|
|
107
|
my @se = grep /se$/, keys %$cm_ref; |
1462
|
8
|
|
|
|
|
24
|
@se = map { /^# (.+?) # se$/; $1; } @se; |
|
38
|
|
|
|
|
109
|
|
|
38
|
|
|
|
|
91
|
|
1463
|
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
my @n_obs |
1465
|
|
|
|
|
|
|
= map { |
1466
|
8
|
|
|
|
|
26
|
my @ivs = split / ~ /, $_; |
|
38
|
|
|
|
|
121
|
|
1467
|
38
|
|
|
|
|
123
|
my $i_ivs = which_id $ivnm, \@ivs; |
1468
|
38
|
|
|
|
|
1092
|
my $icollapsed = setops pdl(0 .. $#$ivnm), 'XOR', $i_ivs; |
1469
|
|
|
|
|
|
|
|
1470
|
38
|
100
|
|
|
|
23029
|
my $collapsed = $icollapsed->nelem? |
1471
|
|
|
|
|
|
|
pdl( @$lvls_ref[(list $icollapsed)] )->prodover |
1472
|
|
|
|
|
|
|
: 1 |
1473
|
|
|
|
|
|
|
; |
1474
|
38
|
|
|
|
|
2040
|
$n * $collapsed; |
1475
|
|
|
|
|
|
|
} @se; |
1476
|
|
|
|
|
|
|
|
1477
|
8
|
|
|
|
|
41
|
for my $i (0 .. $#se) { |
1478
|
|
|
|
|
|
|
($_tmp = $cm_ref->{"# $se[$i] # se"}) |
1479
|
38
|
|
|
|
|
946
|
.= sqrt( $ret->{"| $se[$i] || err ms"} / $n_obs[$i] ); |
1480
|
|
|
|
|
|
|
} |
1481
|
|
|
|
|
|
|
|
1482
|
8
|
|
|
|
|
184
|
return $cm_ref; |
1483
|
|
|
|
|
|
|
} |
1484
|
|
|
|
|
|
|
|
1485
|
|
|
|
|
|
|
=head2 dummy_code |
1486
|
|
|
|
|
|
|
|
1487
|
|
|
|
|
|
|
=for ref |
1488
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
Dummy coding of nominal variable (perl @ ref or 1d pdl) for use in regression. |
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
=for usage |
1494
|
|
|
|
|
|
|
|
1495
|
|
|
|
|
|
|
perldl> @a = qw(a a a b b b c c c) |
1496
|
|
|
|
|
|
|
perldl> p $a = dummy_code(\@a) |
1497
|
|
|
|
|
|
|
[ |
1498
|
|
|
|
|
|
|
[1 1 1 0 0 0 0 0 0] |
1499
|
|
|
|
|
|
|
[0 0 0 1 1 1 0 0 0] |
1500
|
|
|
|
|
|
|
] |
1501
|
|
|
|
|
|
|
|
1502
|
|
|
|
|
|
|
=cut |
1503
|
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
*dummy_code = \&PDLA::dummy_code; |
1505
|
|
|
|
|
|
|
sub PDLA::dummy_code { |
1506
|
0
|
|
|
0
|
0
|
0
|
my ($var_ref) = @_; |
1507
|
|
|
|
|
|
|
|
1508
|
0
|
|
|
|
|
0
|
my $var_e = effect_code( $var_ref ); |
1509
|
|
|
|
|
|
|
|
1510
|
0
|
|
|
|
|
0
|
($_tmp = $var_e->where( $var_e == -1 )) .= 0; |
1511
|
|
|
|
|
|
|
|
1512
|
0
|
|
|
|
|
0
|
return $var_e; |
1513
|
|
|
|
|
|
|
} |
1514
|
|
|
|
|
|
|
|
1515
|
|
|
|
|
|
|
=head2 effect_code |
1516
|
|
|
|
|
|
|
|
1517
|
|
|
|
|
|
|
=for ref |
1518
|
|
|
|
|
|
|
|
1519
|
|
|
|
|
|
|
Unweighted effect coding of nominal variable (perl @ ref or 1d pdl) for use in regression. returns in @ context coded pdl and % ref to level - pdl->dim(1) index. |
1520
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). |
1522
|
|
|
|
|
|
|
|
1523
|
|
|
|
|
|
|
=for usage |
1524
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
my @var = qw( a a a b b b c c c ); |
1526
|
|
|
|
|
|
|
my ($var_e, $map) = effect_code( \@var ); |
1527
|
|
|
|
|
|
|
|
1528
|
|
|
|
|
|
|
print $var_e . $var_e->info . "\n"; |
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
[ |
1531
|
|
|
|
|
|
|
[ 1 1 1 0 0 0 -1 -1 -1] |
1532
|
|
|
|
|
|
|
[ 0 0 0 1 1 1 -1 -1 -1] |
1533
|
|
|
|
|
|
|
] |
1534
|
|
|
|
|
|
|
PDLA: Double D [9,2] |
1535
|
|
|
|
|
|
|
|
1536
|
|
|
|
|
|
|
print "$_\t$map->{$_}\n" for (sort keys %$map) |
1537
|
|
|
|
|
|
|
a 0 |
1538
|
|
|
|
|
|
|
b 1 |
1539
|
|
|
|
|
|
|
c 2 |
1540
|
|
|
|
|
|
|
|
1541
|
|
|
|
|
|
|
=cut |
1542
|
|
|
|
|
|
|
|
1543
|
|
|
|
|
|
|
*effect_code = \&PDLA::effect_code; |
1544
|
|
|
|
|
|
|
sub PDLA::effect_code { |
1545
|
80
|
|
|
80
|
0
|
11556
|
my ($var_ref) = @_; |
1546
|
|
|
|
|
|
|
|
1547
|
|
|
|
|
|
|
# pdl->uniq sorts elems. so instead list it to maintain old order |
1548
|
80
|
100
|
|
|
|
207
|
if (ref $var_ref eq 'PDLA') { |
1549
|
44
|
|
|
|
|
95
|
$var_ref = $var_ref->squeeze; |
1550
|
44
|
50
|
|
|
|
1523
|
$var_ref->getndims > 1 and |
1551
|
|
|
|
|
|
|
croak "multidim pdl passed for single var!"; |
1552
|
44
|
|
|
|
|
105
|
$var_ref = [ list $var_ref ]; |
1553
|
|
|
|
|
|
|
} |
1554
|
|
|
|
|
|
|
|
1555
|
80
|
|
|
|
|
1195
|
my ($var, $map_ref) = PDLA::Stats::Basic::_array_to_pdl( $var_ref ); |
1556
|
80
|
|
|
|
|
224
|
my $var_e = zeroes float, $var->nelem, $var->max; |
1557
|
|
|
|
|
|
|
|
1558
|
80
|
|
|
|
|
7463
|
for my $l (0 .. $var->max - 1) { |
1559
|
347
|
|
|
|
|
25695
|
my $v = $var_e( ,$l); |
1560
|
347
|
|
|
|
|
6364
|
($_tmp = $v->index( which $var == $l )) .= 1; |
1561
|
347
|
|
|
|
|
17319
|
($_tmp = $v->index( which $var == $var->max )) .= -1; |
1562
|
|
|
|
|
|
|
} |
1563
|
|
|
|
|
|
|
|
1564
|
80
|
100
|
|
|
|
6946
|
if ($var->badflag) { |
1565
|
1
|
|
|
|
|
10
|
my $ibad = which $var->isbad; |
1566
|
1
|
|
|
|
|
27
|
($_tmp = $var_e($ibad, )) .= -99; |
1567
|
1
|
|
|
|
|
57
|
$var_e = $var_e->setvaltobad(-99); |
1568
|
|
|
|
|
|
|
} |
1569
|
|
|
|
|
|
|
|
1570
|
80
|
100
|
|
|
|
423
|
return wantarray? ($var_e, $map_ref) : $var_e; |
1571
|
|
|
|
|
|
|
} |
1572
|
|
|
|
|
|
|
|
1573
|
|
|
|
|
|
|
=head2 effect_code_w |
1574
|
|
|
|
|
|
|
|
1575
|
|
|
|
|
|
|
=for ref |
1576
|
|
|
|
|
|
|
|
1577
|
|
|
|
|
|
|
Weighted effect code for nominal variable. returns in @ context coded pdl and % ref to level - pdl->dim(1) index. |
1578
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). |
1580
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
=for usage |
1582
|
|
|
|
|
|
|
|
1583
|
|
|
|
|
|
|
perldl> @a = qw( a a b b b c c ) |
1584
|
|
|
|
|
|
|
perldl> p $a = effect_code_w(\@a) |
1585
|
|
|
|
|
|
|
[ |
1586
|
|
|
|
|
|
|
[ 1 1 0 0 0 -1 -1] |
1587
|
|
|
|
|
|
|
[ 0 0 1 1 1 -1.5 -1.5] |
1588
|
|
|
|
|
|
|
] |
1589
|
|
|
|
|
|
|
|
1590
|
|
|
|
|
|
|
=cut |
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
*effect_code_w = \&PDLA::effect_code_w; |
1593
|
|
|
|
|
|
|
sub PDLA::effect_code_w { |
1594
|
1
|
|
|
1
|
0
|
1113
|
my ($var_ref) = @_; |
1595
|
|
|
|
|
|
|
|
1596
|
1
|
|
|
|
|
3
|
my ($var_e, $map_ref) = effect_code( $var_ref ); |
1597
|
|
|
|
|
|
|
|
1598
|
1
|
50
|
|
|
|
4
|
if ($var_e->sum == 0) { |
1599
|
0
|
0
|
|
|
|
0
|
return wantarray? ($var_e, $map_ref) : $var_e; |
1600
|
|
|
|
|
|
|
} |
1601
|
|
|
|
|
|
|
|
1602
|
1
|
|
|
|
|
41
|
for (0..$var_e->dim(1)-1) { |
1603
|
2
|
|
|
|
|
61
|
my $factor = $var_e( ,$_); |
1604
|
2
|
|
|
|
|
41
|
my $pos = which $factor == 1; |
1605
|
2
|
|
|
|
|
85
|
my $neg = which $factor == -1; |
1606
|
2
|
|
|
|
|
104
|
my $w = $pos->nelem / $neg->nelem; |
1607
|
2
|
|
|
|
|
7
|
($_tmp = $factor($neg)) *= $w; |
1608
|
|
|
|
|
|
|
} |
1609
|
|
|
|
|
|
|
|
1610
|
1
|
50
|
|
|
|
56
|
return wantarray? ($var_e, $map_ref) : $var_e; |
1611
|
|
|
|
|
|
|
} |
1612
|
|
|
|
|
|
|
|
1613
|
|
|
|
|
|
|
=head2 interaction_code |
1614
|
|
|
|
|
|
|
|
1615
|
|
|
|
|
|
|
Returns the coded interaction term for effect-coded variables. |
1616
|
|
|
|
|
|
|
|
1617
|
|
|
|
|
|
|
Supports BAD value (missing or 'BAD' values result in the corresponding pdl elements being marked as BAD). |
1618
|
|
|
|
|
|
|
|
1619
|
|
|
|
|
|
|
=for usage |
1620
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
perldl> $a = sequence(6) > 2 |
1622
|
|
|
|
|
|
|
perldl> p $a = $a->effect_code |
1623
|
|
|
|
|
|
|
[ |
1624
|
|
|
|
|
|
|
[ 1 1 1 -1 -1 -1] |
1625
|
|
|
|
|
|
|
] |
1626
|
|
|
|
|
|
|
|
1627
|
|
|
|
|
|
|
perldl> $b = pdl( qw( 0 1 2 0 1 2 ) ) |
1628
|
|
|
|
|
|
|
perldl> p $b = $b->effect_code |
1629
|
|
|
|
|
|
|
[ |
1630
|
|
|
|
|
|
|
[ 1 0 -1 1 0 -1] |
1631
|
|
|
|
|
|
|
[ 0 1 -1 0 1 -1] |
1632
|
|
|
|
|
|
|
] |
1633
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
perldl> p $ab = interaction_code( $a, $b ) |
1635
|
|
|
|
|
|
|
[ |
1636
|
|
|
|
|
|
|
[ 1 0 -1 -1 -0 1] |
1637
|
|
|
|
|
|
|
[ 0 1 -1 -0 -1 1] |
1638
|
|
|
|
|
|
|
] |
1639
|
|
|
|
|
|
|
|
1640
|
|
|
|
|
|
|
=cut |
1641
|
|
|
|
|
|
|
|
1642
|
|
|
|
|
|
|
*interaction_code = \&PDLA::interaction_code; |
1643
|
|
|
|
|
|
|
sub PDLA::interaction_code { |
1644
|
10
|
|
|
10
|
0
|
61
|
my @vars = @_; |
1645
|
|
|
|
|
|
|
|
1646
|
10
|
|
|
|
|
52
|
my $i = ones( $vars[0]->dim(0), 1 ); |
1647
|
10
|
|
|
|
|
575
|
for (@vars) { |
1648
|
21
|
|
|
|
|
582
|
$i = $i * $_->dummy(1); |
1649
|
21
|
|
|
|
|
1193
|
$i = $i->clump(1,2); |
1650
|
|
|
|
|
|
|
} |
1651
|
|
|
|
|
|
|
|
1652
|
10
|
|
|
|
|
429
|
return $i; |
1653
|
|
|
|
|
|
|
} |
1654
|
|
|
|
|
|
|
|
1655
|
|
|
|
|
|
|
=head2 ols |
1656
|
|
|
|
|
|
|
|
1657
|
|
|
|
|
|
|
=for ref |
1658
|
|
|
|
|
|
|
|
1659
|
|
|
|
|
|
|
Ordinary least squares regression, aka linear regression. Unlike B, ols is not threadable, but it can handle bad value (by ignoring observations with bad value in dependent or independent variables list-wise) and returns the full model in list context with various stats. |
1660
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. Intercept is automatically added and returned as LAST of the coeffs if CONST=>1. Returns full model in list context and coeff in scalar context. |
1662
|
|
|
|
|
|
|
|
1663
|
|
|
|
|
|
|
=for options |
1664
|
|
|
|
|
|
|
|
1665
|
|
|
|
|
|
|
Default options (case insensitive): |
1666
|
|
|
|
|
|
|
|
1667
|
|
|
|
|
|
|
CONST => 1, |
1668
|
|
|
|
|
|
|
PLOT => 1, # see plot_residuals() for plot options |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
=for usage |
1671
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
Usage: |
1673
|
|
|
|
|
|
|
|
1674
|
|
|
|
|
|
|
# suppose this is a person's ratings for top 10 box office movies |
1675
|
|
|
|
|
|
|
# ascending sorted by box office |
1676
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
perldl> p $y = qsort ceil( random(10) * 5 ) |
1678
|
|
|
|
|
|
|
[1 1 2 2 2 2 4 4 5 5] |
1679
|
|
|
|
|
|
|
|
1680
|
|
|
|
|
|
|
# construct IV with linear and quadratic component |
1681
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
perldl> p $x = cat sequence(10), sequence(10)**2 |
1683
|
|
|
|
|
|
|
[ |
1684
|
|
|
|
|
|
|
[ 0 1 2 3 4 5 6 7 8 9] |
1685
|
|
|
|
|
|
|
[ 0 1 4 9 16 25 36 49 64 81] |
1686
|
|
|
|
|
|
|
] |
1687
|
|
|
|
|
|
|
|
1688
|
|
|
|
|
|
|
perldl> %m = $y->ols( $x ) |
1689
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
perldl> p "$_\t$m{$_}\n" for (sort keys %m) |
1691
|
|
|
|
|
|
|
|
1692
|
|
|
|
|
|
|
F 40.4225352112676 |
1693
|
|
|
|
|
|
|
F_df [2 7] |
1694
|
|
|
|
|
|
|
F_p 0.000142834216344756 |
1695
|
|
|
|
|
|
|
R2 0.920314253647587 |
1696
|
|
|
|
|
|
|
|
1697
|
|
|
|
|
|
|
# coeff linear quadratic constant |
1698
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
b [0.21212121 0.03030303 0.98181818] |
1700
|
|
|
|
|
|
|
b_p [0.32800118 0.20303404 0.039910509] |
1701
|
|
|
|
|
|
|
b_se [0.20174693 0.021579989 0.38987581] |
1702
|
|
|
|
|
|
|
b_t [ 1.0514223 1.404219 2.5182844] |
1703
|
|
|
|
|
|
|
ss_model 19.8787878787879 |
1704
|
|
|
|
|
|
|
ss_residual 1.72121212121212 |
1705
|
|
|
|
|
|
|
ss_total 21.6 |
1706
|
|
|
|
|
|
|
y_pred [0.98181818 1.2242424 1.5272727 ... 4.6181818 5.3454545] |
1707
|
|
|
|
|
|
|
|
1708
|
|
|
|
|
|
|
=cut |
1709
|
|
|
|
|
|
|
|
1710
|
|
|
|
|
|
|
*ols = \&PDLA::ols; |
1711
|
|
|
|
|
|
|
sub PDLA::ols { |
1712
|
|
|
|
|
|
|
# y [n], ivs [n x attr] pdl |
1713
|
4
|
|
|
4
|
0
|
1834
|
my ($y, $ivs, $opt) = @_; |
1714
|
4
|
|
|
|
|
9
|
my %opt = ( |
1715
|
|
|
|
|
|
|
CONST => 1, |
1716
|
|
|
|
|
|
|
PLOT => 1, |
1717
|
|
|
|
|
|
|
); |
1718
|
4
|
|
33
|
|
|
22
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
1719
|
|
|
|
|
|
|
|
1720
|
4
|
|
|
|
|
12
|
$y = $y->squeeze; |
1721
|
4
|
50
|
|
|
|
152
|
$y->getndims > 1 and |
1722
|
|
|
|
|
|
|
croak "use ols_t for threaded version"; |
1723
|
|
|
|
|
|
|
|
1724
|
4
|
50
|
|
|
|
17
|
$ivs = $ivs->dummy(1) if $ivs->getndims == 1; |
1725
|
|
|
|
|
|
|
|
1726
|
4
|
|
|
|
|
115
|
($y, $ivs) = _rm_bad_value( $y, $ivs ); |
1727
|
|
|
|
|
|
|
|
1728
|
|
|
|
|
|
|
# set up ivs and const as ivs |
1729
|
|
|
|
|
|
|
$opt{CONST} and |
1730
|
4
|
100
|
|
|
|
17
|
$ivs = $ivs->glue( 1, ones($ivs->dim(0)) ); |
1731
|
|
|
|
|
|
|
|
1732
|
|
|
|
|
|
|
# Internally normalise data |
1733
|
|
|
|
|
|
|
|
1734
|
4
|
|
|
|
|
230
|
my $ymean = (abs($y)->sum)/($y->nelem); |
1735
|
4
|
50
|
|
|
|
197
|
$ymean = 1 if $ymean == 0; |
1736
|
4
|
|
|
|
|
34
|
my $y2 = $y / $ymean; |
1737
|
|
|
|
|
|
|
|
1738
|
|
|
|
|
|
|
# Do the fit |
1739
|
|
|
|
|
|
|
|
1740
|
4
|
|
|
|
|
15
|
my $Y = $ivs x $y2->dummy(0); |
1741
|
|
|
|
|
|
|
|
1742
|
4
|
|
|
|
|
246
|
my $C; |
1743
|
4
|
50
|
|
|
|
9
|
if ( $SLATEC ) { |
1744
|
0
|
|
|
|
|
0
|
$C = PDLA::Slatec::matinv( $ivs x $ivs->xchg(0,1) ); |
1745
|
|
|
|
|
|
|
} |
1746
|
|
|
|
|
|
|
else { |
1747
|
4
|
|
|
|
|
17
|
$C = inv( $ivs x $ivs->xchg(0,1) ); |
1748
|
|
|
|
|
|
|
} |
1749
|
|
|
|
|
|
|
|
1750
|
|
|
|
|
|
|
# Fitted coefficients vector |
1751
|
4
|
|
|
|
|
5330
|
my $coeff = PDLA::squeeze( $C x $Y ); |
1752
|
4
|
|
|
|
|
321
|
$coeff *= $ymean; # Un-normalise |
1753
|
|
|
|
|
|
|
|
1754
|
4
|
|
|
|
|
50
|
my %ret; |
1755
|
|
|
|
|
|
|
|
1756
|
|
|
|
|
|
|
# ***$coeff x $ivs looks nice but produces nan on successive tries*** |
1757
|
4
|
|
|
|
|
13
|
$ret{y_pred} = sumover( $coeff * $ivs->transpose ); |
1758
|
|
|
|
|
|
|
|
1759
|
4
|
50
|
|
|
|
91
|
$opt{PLOT} and $y->plot_residuals( $ret{y_pred}, \%opt ); |
1760
|
|
|
|
|
|
|
|
1761
|
4
|
100
|
|
|
|
23
|
return $coeff |
1762
|
|
|
|
|
|
|
unless wantarray; |
1763
|
|
|
|
|
|
|
|
1764
|
2
|
|
|
|
|
3
|
$ret{b} = $coeff; |
1765
|
2
|
50
|
|
|
|
20
|
$ret{ss_total} = $opt{CONST}? $y->ss : sum( $y ** 2 ); |
1766
|
2
|
|
|
|
|
18
|
$ret{ss_residual} = $y->sse( $ret{y_pred} ); |
1767
|
2
|
|
|
|
|
14
|
$ret{ss_model} = $ret{ss_total} - $ret{ss_residual}; |
1768
|
2
|
|
|
|
|
13
|
$ret{R2} = $ret{ss_model} / $ret{ss_total}; |
1769
|
|
|
|
|
|
|
|
1770
|
2
|
50
|
|
|
|
10
|
my $n_var = $opt{CONST}? $ivs->dim(1) - 1 : $ivs->dim(1); |
1771
|
2
|
|
|
|
|
9
|
$ret{F_df} = pdl( $n_var, $y->nelem - $ivs->dim(1) ); |
1772
|
|
|
|
|
|
|
$ret{F} = $ret{ss_model} / $ret{F_df}->(0) |
1773
|
2
|
|
|
|
|
43
|
/ ( $ret{ss_residual} / $ret{F_df}->(1) ); |
1774
|
|
|
|
|
|
|
$ret{F_p} = 1 - $ret{F}->gsl_cdf_fdist_P( $ret{F_df}->dog ) |
1775
|
2
|
50
|
|
|
|
72
|
if $CDF; |
1776
|
|
|
|
|
|
|
|
1777
|
2
|
50
|
|
|
|
6
|
my $se_b = ones( $coeff->dims? $coeff->dims : 1 ); |
1778
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
$opt{CONST} and |
1780
|
2
|
50
|
|
|
|
154
|
($_tmp = $se_b(-1)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) * $C(-1,-1) ); |
1781
|
|
|
|
|
|
|
|
1782
|
|
|
|
|
|
|
# get the se for bs by successivly regressing each iv by the rest ivs |
1783
|
2
|
50
|
|
|
|
132
|
if ($ivs->dim(1) > 1) { |
1784
|
2
|
|
|
|
|
6
|
for my $k (0 .. $n_var-1) { |
1785
|
2
|
|
|
|
|
4
|
my @G = grep { $_ != $k } (0 .. $n_var-1); |
|
2
|
|
|
|
|
6
|
|
1786
|
2
|
|
|
|
|
8
|
my $G = $ivs->dice_axis(1, \@G); |
1787
|
|
|
|
|
|
|
$opt{CONST} and |
1788
|
2
|
50
|
|
|
|
75
|
$G = $G->glue( 1, ones($ivs->dim(0)) ); |
1789
|
2
|
|
|
|
|
118
|
my $b_G = $ivs( ,$k)->ols( $G, {CONST=>0,PLOT=>0} ); |
1790
|
|
|
|
|
|
|
|
1791
|
2
|
|
|
|
|
10
|
my $ss_res_k = $ivs( ,$k)->squeeze->sse( sumover($b_G * $G->transpose) ); |
1792
|
|
|
|
|
|
|
|
1793
|
2
|
|
|
|
|
282
|
($_tmp = $se_b($k)) .= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / $ss_res_k ); |
1794
|
|
|
|
|
|
|
} |
1795
|
|
|
|
|
|
|
} |
1796
|
|
|
|
|
|
|
else { |
1797
|
|
|
|
|
|
|
($_tmp = $se_b(0)) |
1798
|
0
|
|
|
|
|
0
|
.= sqrt( $ret{ss_residual} / $ret{F_df}->(1) / sum( $ivs( ,0)**2 ) ); |
1799
|
|
|
|
|
|
|
} |
1800
|
|
|
|
|
|
|
|
1801
|
2
|
|
|
|
|
107
|
$ret{b_se} = $se_b; |
1802
|
2
|
|
|
|
|
14
|
$ret{b_t} = $ret{b} / $ret{b_se}; |
1803
|
2
|
50
|
|
|
|
6
|
$ret{b_p} = 2 * ( 1 - $ret{b_t}->abs->gsl_cdf_tdist_P( $ret{F_df}->(1) ) ) |
1804
|
|
|
|
|
|
|
if $CDF; |
1805
|
|
|
|
|
|
|
|
1806
|
2
|
50
|
|
|
|
7
|
for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze }; |
|
20
|
|
|
|
|
503
|
|
1807
|
|
|
|
|
|
|
|
1808
|
2
|
|
|
|
|
76
|
return %ret; |
1809
|
|
|
|
|
|
|
} |
1810
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
sub _rm_bad_value { |
1812
|
8
|
|
|
8
|
|
15
|
my ($y, $ivs) = @_; |
1813
|
|
|
|
|
|
|
|
1814
|
8
|
|
|
|
|
11
|
my $idx; |
1815
|
8
|
100
|
66
|
|
|
22
|
if ($y->check_badflag or $ivs->check_badflag) { |
1816
|
3
|
|
|
|
|
186
|
$idx = which(($y->isbad==0) & (nbadover ($ivs->transpose)==0)); |
1817
|
3
|
|
|
|
|
192
|
$y = $y($idx)->sever; |
1818
|
3
|
|
|
|
|
125
|
$ivs = $ivs($idx,)->sever; |
1819
|
3
|
|
|
|
|
116
|
$ivs->badflag(0); |
1820
|
3
|
|
|
|
|
7
|
$y->badflag(0); |
1821
|
|
|
|
|
|
|
} |
1822
|
|
|
|
|
|
|
|
1823
|
8
|
|
|
|
|
81
|
return $y, $ivs, $idx; |
1824
|
|
|
|
|
|
|
} |
1825
|
|
|
|
|
|
|
|
1826
|
|
|
|
|
|
|
=head2 ols_rptd |
1827
|
|
|
|
|
|
|
|
1828
|
|
|
|
|
|
|
=for ref |
1829
|
|
|
|
|
|
|
|
1830
|
|
|
|
|
|
|
Repeated measures linear regression (Lorch & Myers, 1990; Van den Noortgate & Onghena, 2006). Handles purely within-subject design for now. See t/stats_ols_rptd.t for an example using the Lorch and Myers data. |
1831
|
|
|
|
|
|
|
|
1832
|
|
|
|
|
|
|
=for usage |
1833
|
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
|
Usage: |
1835
|
|
|
|
|
|
|
|
1836
|
|
|
|
|
|
|
# This is the example from Lorch and Myers (1990), |
1837
|
|
|
|
|
|
|
# a study on how characteristics of sentences affected reading time |
1838
|
|
|
|
|
|
|
# Three within-subject IVs: |
1839
|
|
|
|
|
|
|
# SP -- serial position of sentence |
1840
|
|
|
|
|
|
|
# WORDS -- number of words in sentence |
1841
|
|
|
|
|
|
|
# NEW -- number of new arguments in sentence |
1842
|
|
|
|
|
|
|
|
1843
|
|
|
|
|
|
|
# $subj can be 1D pdl or @ ref and must be the first argument |
1844
|
|
|
|
|
|
|
# IV can be 1D @ ref or pdl |
1845
|
|
|
|
|
|
|
# 1D @ ref is effect coded internally into pdl |
1846
|
|
|
|
|
|
|
# pdl is left as is |
1847
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
my %r = $rt->ols_rptd( $subj, $sp, $words, $new ); |
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
print "$_\t$r{$_}\n" for (sort keys %r); |
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
(ss_residual) 58.3754646504336 |
1853
|
|
|
|
|
|
|
(ss_subject) 51.8590337714286 |
1854
|
|
|
|
|
|
|
(ss_total) 405.188241771429 |
1855
|
|
|
|
|
|
|
# SP WORDS NEW |
1856
|
|
|
|
|
|
|
F [ 7.208473 61.354153 1.0243311] |
1857
|
|
|
|
|
|
|
F_p [0.025006181 2.619081e-05 0.33792837] |
1858
|
|
|
|
|
|
|
coeff [0.33337285 0.45858933 0.15162986] |
1859
|
|
|
|
|
|
|
df [1 1 1] |
1860
|
|
|
|
|
|
|
df_err [9 9 9] |
1861
|
|
|
|
|
|
|
ms [ 18.450705 73.813294 0.57026483] |
1862
|
|
|
|
|
|
|
ms_err [ 2.5595857 1.2030692 0.55671923] |
1863
|
|
|
|
|
|
|
ss [ 18.450705 73.813294 0.57026483] |
1864
|
|
|
|
|
|
|
ss_err [ 23.036272 10.827623 5.0104731] |
1865
|
|
|
|
|
|
|
|
1866
|
|
|
|
|
|
|
|
1867
|
|
|
|
|
|
|
=cut |
1868
|
|
|
|
|
|
|
|
1869
|
|
|
|
|
|
|
*ols_rptd = \&PDLA::ols_rptd; |
1870
|
|
|
|
|
|
|
sub PDLA::ols_rptd { |
1871
|
1
|
|
|
1
|
0
|
68
|
my ($y, $subj, @ivs_raw) = @_; |
1872
|
|
|
|
|
|
|
|
1873
|
1
|
|
|
|
|
6
|
$y = $y->squeeze; |
1874
|
1
|
50
|
|
|
|
63
|
$y->getndims > 1 and |
1875
|
|
|
|
|
|
|
croak "ols_rptd does not support threading"; |
1876
|
|
|
|
|
|
|
|
1877
|
1
|
50
|
66
|
|
|
4
|
my @ivs = map { (ref $_ eq 'PDLA' and $_->ndims > 1)? $_ |
|
3
|
100
|
|
|
|
59
|
|
1878
|
|
|
|
|
|
|
: ref $_ eq 'PDLA' ? $_->dummy(1) |
1879
|
|
|
|
|
|
|
: scalar effect_code($_) |
1880
|
|
|
|
|
|
|
; |
1881
|
|
|
|
|
|
|
} @ivs_raw; |
1882
|
|
|
|
|
|
|
|
1883
|
1
|
|
|
|
|
3
|
my %r; |
1884
|
|
|
|
|
|
|
|
1885
|
1
|
|
|
|
|
71
|
$r{'(ss_total)'} = $y->ss; |
1886
|
|
|
|
|
|
|
|
1887
|
|
|
|
|
|
|
# STEP 1: subj |
1888
|
|
|
|
|
|
|
|
1889
|
1
|
|
|
|
|
9
|
my $s = effect_code $subj; # gives same results as dummy_code |
1890
|
1
|
|
|
|
|
8
|
my $b_s = $y->ols_t($s); |
1891
|
1
|
|
|
|
|
9
|
my $pred = sumover($b_s(0:-2) * $s->transpose) + $b_s(-1); |
1892
|
1
|
|
|
|
|
161
|
$r{'(ss_subject)'} = $r{'(ss_total)'} - $y->sse( $pred ); |
1893
|
|
|
|
|
|
|
|
1894
|
|
|
|
|
|
|
# STEP 2: add predictor variables |
1895
|
|
|
|
|
|
|
|
1896
|
1
|
|
|
|
|
12
|
my $iv_p = $s->glue(1, @ivs); |
1897
|
1
|
|
|
|
|
108
|
my $b_p = $y->ols_t($iv_p); |
1898
|
|
|
|
|
|
|
|
1899
|
|
|
|
|
|
|
# only care about coeff for predictor vars. no subj or const coeff |
1900
|
1
|
|
|
|
|
8
|
$r{coeff} = $b_p(-(@ivs+1) : -2)->sever; |
1901
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
# get total sse for this step |
1903
|
1
|
|
|
|
|
20
|
$pred = sumover($b_p(0:-2) * $iv_p->transpose) + $b_p(-1); |
1904
|
1
|
|
|
|
|
85
|
my $ss_pe = $y->sse( $pred ); |
1905
|
|
|
|
|
|
|
|
1906
|
|
|
|
|
|
|
# get predictor ss by successively reducing the model |
1907
|
1
|
|
|
|
|
7
|
$r{ss} = zeroes scalar(@ivs); |
1908
|
1
|
|
|
|
|
67
|
for my $i (0 .. $#ivs) { |
1909
|
3
|
|
|
|
|
135
|
my @i_rest = grep { $_ != $i } 0 .. $#ivs; |
|
9
|
|
|
|
|
21
|
|
1910
|
3
|
|
|
|
|
24
|
my $iv = $s->glue(1, @ivs[ @i_rest ]); |
1911
|
3
|
|
|
|
|
292
|
my $b = $y->ols_t($iv); |
1912
|
3
|
|
|
|
|
16
|
$pred = sumover($b(0:-2) * $iv->transpose) + $b(-1); |
1913
|
3
|
|
|
|
|
204
|
($_tmp = $r{ss}->($i)) .= $y->sse($pred) - $ss_pe; |
1914
|
|
|
|
|
|
|
} |
1915
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
# STEP 3: get precitor x subj interaction as error term |
1917
|
|
|
|
|
|
|
|
1918
|
1
|
|
|
|
|
67
|
my $iv_e = PDLA::glue 1, map { interaction_code( $s, $_ ) } @ivs; |
|
3
|
|
|
|
|
10
|
|
1919
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
# get total sse for this step. full model now. |
1921
|
1
|
|
|
|
|
180
|
my $b_f = $y->ols_t( $iv_p->glue(1,$iv_e) ); |
1922
|
1
|
|
|
|
|
12
|
$pred = sumover($b_f(0:-2) * $iv_p->glue(1,$iv_e)->transpose) + $b_f(-1); |
1923
|
1
|
|
|
|
|
189
|
$r{'(ss_residual)'} = $y->sse( $pred ); |
1924
|
|
|
|
|
|
|
|
1925
|
|
|
|
|
|
|
# get predictor x subj ss by successively reducing the error term |
1926
|
1
|
|
|
|
|
7
|
$r{ss_err} = zeroes scalar(@ivs); |
1927
|
1
|
|
|
|
|
83
|
for my $i (0 .. $#ivs) { |
1928
|
3
|
|
|
|
|
141
|
my @i_rest = grep { $_ != $i } 0 .. $#ivs; |
|
9
|
|
|
|
|
21
|
|
1929
|
3
|
|
|
|
|
11
|
my $e_rest = PDLA::glue 1, map { interaction_code( $s, $_ ) } @ivs[@i_rest]; |
|
6
|
|
|
|
|
20
|
|
1930
|
3
|
|
|
|
|
253
|
my $iv = $iv_p->glue(1, $e_rest); |
1931
|
3
|
|
|
|
|
126
|
my $b = $y->ols_t($iv); |
1932
|
3
|
|
|
|
|
14
|
my $pred = sumover($b(0:-2) * $iv->transpose) + $b(-1); |
1933
|
3
|
|
|
|
|
221
|
($_tmp = $r{ss_err}->($i)) .= $y->sse($pred) - $r{'(ss_residual)'}; |
1934
|
|
|
|
|
|
|
} |
1935
|
|
|
|
|
|
|
|
1936
|
|
|
|
|
|
|
# Finally, get MS, F, etc |
1937
|
|
|
|
|
|
|
|
1938
|
1
|
|
|
|
|
66
|
$r{df} = pdl( map { $_->squeeze->ndims } @ivs ); |
|
3
|
|
|
|
|
86
|
|
1939
|
1
|
|
|
|
|
67
|
$r{ms} = $r{ss} / $r{df}; |
1940
|
|
|
|
|
|
|
|
1941
|
1
|
|
|
|
|
14
|
$r{df_err} = $s->dim(1) * $r{df}; |
1942
|
1
|
|
|
|
|
11
|
$r{ms_err} = $r{ss_err} / $r{df_err}; |
1943
|
|
|
|
|
|
|
|
1944
|
1
|
|
|
|
|
8
|
$r{F} = $r{ms} / $r{ms_err}; |
1945
|
|
|
|
|
|
|
|
1946
|
|
|
|
|
|
|
$r{F_p} = 1 - $r{F}->gsl_cdf_fdist_P( $r{df}, $r{df_err} ) |
1947
|
1
|
50
|
|
|
|
6
|
if $CDF; |
1948
|
|
|
|
|
|
|
|
1949
|
1
|
|
|
|
|
73
|
return %r; |
1950
|
|
|
|
|
|
|
} |
1951
|
|
|
|
|
|
|
|
1952
|
|
|
|
|
|
|
|
1953
|
|
|
|
|
|
|
=head2 logistic |
1954
|
|
|
|
|
|
|
|
1955
|
|
|
|
|
|
|
=for ref |
1956
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
Logistic regression with maximum likelihood estimation using PDLA::Fit::LM (requires PDLA::Slatec. Hence loaded with "require" in the sub instead of "use" at the beginning). |
1958
|
|
|
|
|
|
|
|
1959
|
|
|
|
|
|
|
IVs ($x) should be pdl dims $y->nelem or $y->nelem x n_iv. Do not supply the constant vector in $x. It is included in the model and returned as LAST of coeff. Returns full model in list context and coeff in scalar context. |
1960
|
|
|
|
|
|
|
|
1961
|
|
|
|
|
|
|
The significance tests are likelihood ratio tests (-2LL deviance) tests. IV significance is tested by comparing deviances between the reduced model (ie with the IV in question removed) and the full model. |
1962
|
|
|
|
|
|
|
|
1963
|
|
|
|
|
|
|
***NOTE: the results here are qualitatively similar to but not identical with results from R, because different algorithms are used for the nonlinear parameter fit. Use with discretion*** |
1964
|
|
|
|
|
|
|
|
1965
|
|
|
|
|
|
|
=for options |
1966
|
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
|
Default options (case insensitive): |
1968
|
|
|
|
|
|
|
|
1969
|
|
|
|
|
|
|
INITP => zeroes( $x->dim(1) + 1 ), # n_iv + 1 |
1970
|
|
|
|
|
|
|
MAXIT => 1000, |
1971
|
|
|
|
|
|
|
EPS => 1e-7, |
1972
|
|
|
|
|
|
|
|
1973
|
|
|
|
|
|
|
=for usage |
1974
|
|
|
|
|
|
|
|
1975
|
|
|
|
|
|
|
Usage: |
1976
|
|
|
|
|
|
|
|
1977
|
|
|
|
|
|
|
# suppose this is whether a person had rented 10 movies |
1978
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
perldl> p $y = ushort( random(10)*2 ) |
1980
|
|
|
|
|
|
|
[0 0 0 1 1 0 0 1 1 1] |
1981
|
|
|
|
|
|
|
|
1982
|
|
|
|
|
|
|
# IV 1 is box office ranking |
1983
|
|
|
|
|
|
|
|
1984
|
|
|
|
|
|
|
perldl> p $x1 = sequence(10) |
1985
|
|
|
|
|
|
|
[0 1 2 3 4 5 6 7 8 9] |
1986
|
|
|
|
|
|
|
|
1987
|
|
|
|
|
|
|
# IV 2 is whether the movie is action- or chick-flick |
1988
|
|
|
|
|
|
|
|
1989
|
|
|
|
|
|
|
perldl> p $x2 = sequence(10) % 2 |
1990
|
|
|
|
|
|
|
[0 1 0 1 0 1 0 1 0 1] |
1991
|
|
|
|
|
|
|
|
1992
|
|
|
|
|
|
|
# concatenate the IVs together |
1993
|
|
|
|
|
|
|
|
1994
|
|
|
|
|
|
|
perldl> p $x = cat $x1, $x2 |
1995
|
|
|
|
|
|
|
[ |
1996
|
|
|
|
|
|
|
[0 1 2 3 4 5 6 7 8 9] |
1997
|
|
|
|
|
|
|
[0 1 0 1 0 1 0 1 0 1] |
1998
|
|
|
|
|
|
|
] |
1999
|
|
|
|
|
|
|
|
2000
|
|
|
|
|
|
|
perldl> %m = $y->logistic( $x ) |
2001
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
perldl> p "$_\t$m{$_}\n" for (sort keys %m) |
2003
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
D0 13.8629436111989 |
2005
|
|
|
|
|
|
|
Dm 9.8627829791575 |
2006
|
|
|
|
|
|
|
Dm_chisq 4.00016063204141 |
2007
|
|
|
|
|
|
|
Dm_df 2 |
2008
|
|
|
|
|
|
|
Dm_p 0.135324414081692 |
2009
|
|
|
|
|
|
|
# ranking genre constant |
2010
|
|
|
|
|
|
|
b [0.41127706 0.53876358 -2.1201285] |
2011
|
|
|
|
|
|
|
b_chisq [ 3.5974504 0.16835559 2.8577151] |
2012
|
|
|
|
|
|
|
b_p [0.057868258 0.6815774 0.090936587] |
2013
|
|
|
|
|
|
|
iter 12 |
2014
|
|
|
|
|
|
|
y_pred [0.10715577 0.23683909 ... 0.76316091 0.89284423] |
2015
|
|
|
|
|
|
|
|
2016
|
|
|
|
|
|
|
|
2017
|
|
|
|
|
|
|
=cut |
2018
|
|
|
|
|
|
|
|
2019
|
|
|
|
|
|
|
*logistic = \&PDLA::logistic; |
2020
|
|
|
|
|
|
|
sub PDLA::logistic { |
2021
|
0
|
|
|
0
|
0
|
0
|
require PDLA::Fit::LM; # uses PDLA::Slatec |
2022
|
|
|
|
|
|
|
|
2023
|
0
|
|
|
|
|
0
|
my ( $self, $ivs, $opt ) = @_; |
2024
|
|
|
|
|
|
|
|
2025
|
0
|
|
|
|
|
0
|
$self = $self->squeeze; |
2026
|
|
|
|
|
|
|
# make compatible w multiple var cases |
2027
|
0
|
0
|
|
|
|
0
|
$ivs->getndims == 1 and $ivs = $ivs->dummy(1); |
2028
|
0
|
0
|
|
|
|
0
|
$self->dim(0) != $ivs->dim(0) and |
2029
|
|
|
|
|
|
|
carp "mismatched n btwn DV and IV!"; |
2030
|
|
|
|
|
|
|
|
2031
|
0
|
|
|
|
|
0
|
my %opt = ( |
2032
|
|
|
|
|
|
|
INITP => zeroes( $ivs->dim(1) + 1 ), # n_ivs + 1 |
2033
|
|
|
|
|
|
|
MAXIT => 1000, |
2034
|
|
|
|
|
|
|
EPS => 1e-7, |
2035
|
|
|
|
|
|
|
); |
2036
|
0
|
|
0
|
|
|
0
|
$opt and $opt{uc $_} = $opt->{$_} for (%$opt); |
2037
|
|
|
|
|
|
|
# not using it atm |
2038
|
0
|
|
|
|
|
0
|
$opt{WT} = 1; |
2039
|
|
|
|
|
|
|
|
2040
|
|
|
|
|
|
|
# Use lmfit. Fourth input argument is reference to user-defined |
2041
|
|
|
|
|
|
|
# copy INITP so we have the original value when needed |
2042
|
|
|
|
|
|
|
my ($yfit,$coeff,$cov,$iter) |
2043
|
|
|
|
|
|
|
= PDLA::Fit::LM::lmfit($ivs, $self, $opt{WT}, \&_logistic, $opt{INITP}->copy, |
2044
|
0
|
|
|
|
|
0
|
{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } ); |
2045
|
|
|
|
|
|
|
# apparently at least coeff is child of some pdl |
2046
|
|
|
|
|
|
|
# which is changed in later lmfit calls |
2047
|
0
|
|
|
|
|
0
|
$yfit = $yfit->copy; |
2048
|
0
|
|
|
|
|
0
|
$coeff = $coeff->copy; |
2049
|
|
|
|
|
|
|
|
2050
|
0
|
0
|
|
|
|
0
|
return $coeff unless wantarray; |
2051
|
|
|
|
|
|
|
|
2052
|
0
|
|
|
|
|
0
|
my %ret; |
2053
|
|
|
|
|
|
|
|
2054
|
0
|
|
|
|
|
0
|
my $n0 = $self->where($self == 0)->nelem; |
2055
|
0
|
|
|
|
|
0
|
my $n1 = $self->nelem - $n0; |
2056
|
|
|
|
|
|
|
|
2057
|
0
|
|
|
|
|
0
|
$ret{D0} = -2*($n0 * log($n0 / $self->nelem) + $n1 * log($n1 / $self->nelem)); |
2058
|
0
|
|
|
|
|
0
|
$ret{Dm} = sum( $self->dvrs( $yfit ) ** 2 ); |
2059
|
0
|
|
|
|
|
0
|
$ret{Dm_chisq} = $ret{D0} - $ret{Dm}; |
2060
|
0
|
|
|
|
|
0
|
$ret{Dm_df} = $ivs->dim(1); |
2061
|
|
|
|
|
|
|
$ret{Dm_p} |
2062
|
|
|
|
|
|
|
= 1 - PDLA::GSL::CDF::gsl_cdf_chisq_P( $ret{Dm_chisq}, $ret{Dm_df} ) |
2063
|
0
|
0
|
|
|
|
0
|
if $CDF; |
2064
|
|
|
|
|
|
|
|
2065
|
0
|
|
|
|
|
0
|
my $coeff_chisq = zeroes $opt{INITP}->nelem; |
2066
|
|
|
|
|
|
|
|
2067
|
0
|
0
|
|
|
|
0
|
if ( $ivs->dim(1) > 1 ) { |
2068
|
0
|
|
|
|
|
0
|
for my $k (0 .. $ivs->dim(1)-1) { |
2069
|
0
|
|
|
|
|
0
|
my @G = grep { $_ != $k } (0 .. $ivs->dim(1)-1); |
|
0
|
|
|
|
|
0
|
|
2070
|
0
|
|
|
|
|
0
|
my $G = $ivs->dice_axis(1, \@G); |
2071
|
|
|
|
|
|
|
|
2072
|
0
|
|
|
|
|
0
|
my $init = $opt{INITP}->dice([ @G, $opt{INITP}->dim(0)-1 ])->copy; |
2073
|
|
|
|
|
|
|
my $y_G |
2074
|
|
|
|
|
|
|
= PDLA::Fit::LM::lmfit( $G, $self, $opt{WT}, \&_logistic, $init, |
2075
|
0
|
|
|
|
|
0
|
{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } ); |
2076
|
|
|
|
|
|
|
|
2077
|
0
|
|
|
|
|
0
|
($_tmp = $coeff_chisq($k)) .= $self->dm( $y_G ) - $ret{Dm}; |
2078
|
|
|
|
|
|
|
} |
2079
|
|
|
|
|
|
|
} |
2080
|
|
|
|
|
|
|
else { |
2081
|
|
|
|
|
|
|
# d0 is, by definition, the deviance with only intercept |
2082
|
0
|
|
|
|
|
0
|
($_tmp = $coeff_chisq(0)) .= $ret{D0} - $ret{Dm}; |
2083
|
|
|
|
|
|
|
} |
2084
|
|
|
|
|
|
|
|
2085
|
|
|
|
|
|
|
my $y_c |
2086
|
|
|
|
|
|
|
= PDLA::Fit::LM::lmfit( $ivs, $self, $opt{WT}, \&_logistic_no_intercept, $opt{INITP}->(0:-2)->copy, |
2087
|
0
|
|
|
|
|
0
|
{ Maxiter=>$opt{MAXIT}, Eps=>$opt{EPS} } ); |
2088
|
|
|
|
|
|
|
|
2089
|
0
|
|
|
|
|
0
|
($_tmp = $coeff_chisq(-1)) .= $self->dm( $y_c ) - $ret{Dm}; |
2090
|
|
|
|
|
|
|
|
2091
|
0
|
|
|
|
|
0
|
$ret{b} = $coeff; |
2092
|
0
|
|
|
|
|
0
|
$ret{b_chisq} = $coeff_chisq; |
2093
|
0
|
0
|
|
|
|
0
|
$ret{b_p} = 1 - $ret{b_chisq}->gsl_cdf_chisq_P( 1 ) |
2094
|
|
|
|
|
|
|
if $CDF; |
2095
|
0
|
|
|
|
|
0
|
$ret{y_pred} = $yfit; |
2096
|
0
|
|
|
|
|
0
|
$ret{iter} = $iter; |
2097
|
|
|
|
|
|
|
|
2098
|
0
|
0
|
|
|
|
0
|
for (keys %ret) { ref $ret{$_} eq 'PDLA' and $ret{$_} = $ret{$_}->squeeze }; |
|
0
|
|
|
|
|
0
|
|
2099
|
|
|
|
|
|
|
|
2100
|
0
|
|
|
|
|
0
|
return %ret; |
2101
|
|
|
|
|
|
|
} |
2102
|
|
|
|
|
|
|
|
2103
|
|
|
|
|
|
|
sub _logistic { |
2104
|
0
|
|
|
0
|
|
0
|
my ($x,$par,$ym,$dyda) = @_; |
2105
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
# $b and $c are fit parameters slope and intercept |
2107
|
0
|
|
|
|
|
0
|
my $b = $par(0 : $x->dim(1) - 1)->sever; |
2108
|
0
|
|
|
|
|
0
|
my $c = $par(-1)->sever; |
2109
|
|
|
|
|
|
|
|
2110
|
|
|
|
|
|
|
# Write function with dependent variable $ym, |
2111
|
|
|
|
|
|
|
# independent variable $x, and fit parameters as specified above. |
2112
|
|
|
|
|
|
|
# Use the .= (dot equals) assignment operator to express the equality |
2113
|
|
|
|
|
|
|
# (not just a plain equals) |
2114
|
0
|
|
|
|
|
0
|
($_tmp = $ym) .= 1 / ( 1 + exp( -1 * (sumover($b * $x->transpose) + $c) ) ); |
2115
|
|
|
|
|
|
|
|
2116
|
0
|
|
|
|
|
0
|
my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1); |
|
0
|
|
|
|
|
0
|
|
2117
|
|
|
|
|
|
|
|
2118
|
|
|
|
|
|
|
# Partial derivative of the function with respect to each slope |
2119
|
|
|
|
|
|
|
# fit parameter ($b in this case). Again, note .= assignment |
2120
|
|
|
|
|
|
|
# operator (not just "equals") |
2121
|
|
|
|
|
|
|
($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym) |
2122
|
0
|
|
|
|
|
0
|
for (0 .. $b->dim(0)-1); |
2123
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
# Partial derivative of the function re intercept par |
2125
|
0
|
|
|
|
|
0
|
($_tmp = $dy[-1]) .= $ym * (1 - $ym); |
2126
|
|
|
|
|
|
|
} |
2127
|
|
|
|
|
|
|
|
2128
|
|
|
|
|
|
|
sub _logistic_no_intercept { |
2129
|
0
|
|
|
0
|
|
0
|
my ($x,$par,$ym,$dyda) = @_; |
2130
|
|
|
|
|
|
|
|
2131
|
0
|
|
|
|
|
0
|
my $b = $par(0 : $x->dim(1) - 1)->sever; |
2132
|
|
|
|
|
|
|
|
2133
|
|
|
|
|
|
|
# Write function with dependent variable $ym, |
2134
|
|
|
|
|
|
|
# independent variable $x, and fit parameters as specified above. |
2135
|
|
|
|
|
|
|
# Use the .= (dot equals) assignment operator to express the equality |
2136
|
|
|
|
|
|
|
# (not just a plain equals) |
2137
|
0
|
|
|
|
|
0
|
($_tmp = $ym) .= 1 / ( 1 + exp( -1 * sumover($b * $x->transpose) ) ); |
2138
|
|
|
|
|
|
|
|
2139
|
0
|
|
|
|
|
0
|
my (@dy) = map {$dyda -> slice(",($_)") } (0 .. $par->dim(0)-1); |
|
0
|
|
|
|
|
0
|
|
2140
|
|
|
|
|
|
|
|
2141
|
|
|
|
|
|
|
# Partial derivative of the function with respect to each slope |
2142
|
|
|
|
|
|
|
# fit parameter ($b in this case). Again, note .= assignment |
2143
|
|
|
|
|
|
|
# operator (not just "equals") |
2144
|
|
|
|
|
|
|
($_tmp = $dy[$_]) .= $x( ,$_) * $ym * (1 - $ym) |
2145
|
0
|
|
|
|
|
0
|
for (0 .. $b->dim(0)-1); |
2146
|
|
|
|
|
|
|
} |
2147
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
=head2 pca |
2149
|
|
|
|
|
|
|
|
2150
|
|
|
|
|
|
|
=for ref |
2151
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
Principal component analysis. Based on corr instead of cov (bad values are ignored pair-wise. OK when bad values are few but otherwise probably should fill_m etc before pca). Use PDLA::Slatec::eigsys() if installed, otherwise use PDLA::MatrixOps::eigens_sym(). |
2153
|
|
|
|
|
|
|
|
2154
|
|
|
|
|
|
|
=for options |
2155
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
Default options (case insensitive): |
2157
|
|
|
|
|
|
|
|
2158
|
|
|
|
|
|
|
CORR => 1, # boolean. use correlation or covariance |
2159
|
|
|
|
|
|
|
PLOT => 1, # calls plot_screes by default |
2160
|
|
|
|
|
|
|
# can set plot_screes options here |
2161
|
|
|
|
|
|
|
|
2162
|
|
|
|
|
|
|
=for usage |
2163
|
|
|
|
|
|
|
|
2164
|
|
|
|
|
|
|
Usage: |
2165
|
|
|
|
|
|
|
|
2166
|
|
|
|
|
|
|
my $d = qsort random 10, 5; # 10 obs on 5 variables |
2167
|
|
|
|
|
|
|
my %r = $d->pca( \%opt ); |
2168
|
|
|
|
|
|
|
print "$_\t$r{$_}\n" for (keys %r); |
2169
|
|
|
|
|
|
|
|
2170
|
|
|
|
|
|
|
eigenvalue # variance accounted for by each component |
2171
|
|
|
|
|
|
|
[4.70192 0.199604 0.0471421 0.0372981 0.0140346] |
2172
|
|
|
|
|
|
|
|
2173
|
|
|
|
|
|
|
eigenvector # dim var x comp. weights for mapping variables to component |
2174
|
|
|
|
|
|
|
[ |
2175
|
|
|
|
|
|
|
[ -0.451251 -0.440696 -0.457628 -0.451491 -0.434618] |
2176
|
|
|
|
|
|
|
[ -0.274551 0.582455 0.131494 0.255261 -0.709168] |
2177
|
|
|
|
|
|
|
[ 0.43282 0.500662 -0.139209 -0.735144 -0.0467834] |
2178
|
|
|
|
|
|
|
[ 0.693634 -0.428171 0.125114 0.128145 -0.550879] |
2179
|
|
|
|
|
|
|
[ 0.229202 0.180393 -0.859217 0.4173 0.0503155] |
2180
|
|
|
|
|
|
|
] |
2181
|
|
|
|
|
|
|
|
2182
|
|
|
|
|
|
|
loadings # dim var x comp. correlation between variable and component |
2183
|
|
|
|
|
|
|
[ |
2184
|
|
|
|
|
|
|
[ -0.978489 -0.955601 -0.992316 -0.97901 -0.942421] |
2185
|
|
|
|
|
|
|
[ -0.122661 0.260224 0.0587476 0.114043 -0.316836] |
2186
|
|
|
|
|
|
|
[ 0.0939749 0.108705 -0.0302253 -0.159616 -0.0101577] |
2187
|
|
|
|
|
|
|
[ 0.13396 -0.0826915 0.0241629 0.0247483 -0.10639] |
2188
|
|
|
|
|
|
|
[ 0.027153 0.0213708 -0.101789 0.0494365 0.00596076] |
2189
|
|
|
|
|
|
|
] |
2190
|
|
|
|
|
|
|
|
2191
|
|
|
|
|
|
|
pct_var # percent variance accounted for by each component |
2192
|
|
|
|
|
|
|
[0.940384 0.0399209 0.00942842 0.00745963 0.00280691] |
2193
|
|
|
|
|
|
|
|
2194
|
|
|
|
|
|
|
Plot scores along the first two components, |
2195
|
|
|
|
|
|
|
|
2196
|
|
|
|
|
|
|
$d->plot_scores( $r{eigenvector} ); |
2197
|
|
|
|
|
|
|
|
2198
|
|
|
|
|
|
|
=cut |
2199
|
|
|
|
|
|
|
|
2200
|
|
|
|
|
|
|
*pca = \&PDLA::pca; |
2201
|
|
|
|
|
|
|
sub PDLA::pca { |
2202
|
3
|
|
|
3
|
0
|
5295
|
my ($self, $opt) = @_; |
2203
|
|
|
|
|
|
|
|
2204
|
3
|
|
|
|
|
7
|
my %opt = ( |
2205
|
|
|
|
|
|
|
CORR => 1, |
2206
|
|
|
|
|
|
|
PLOT => 1, |
2207
|
|
|
|
|
|
|
); |
2208
|
3
|
|
33
|
|
|
18
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
2209
|
|
|
|
|
|
|
|
2210
|
3
|
100
|
|
|
|
125
|
my $var_var = $opt{CORR}? $self->corr_table : $self->cov_table; |
2211
|
|
|
|
|
|
|
|
2212
|
|
|
|
|
|
|
# value is axis pdl and score is var x axis |
2213
|
3
|
|
|
|
|
8
|
my ($eigval, $eigvec); |
2214
|
3
|
50
|
|
|
|
8
|
if ( $SLATEC ) { |
2215
|
0
|
|
|
|
|
0
|
($eigval, $eigvec) = $var_var->PDLA::Slatec::eigsys; |
2216
|
|
|
|
|
|
|
} |
2217
|
|
|
|
|
|
|
else { |
2218
|
3
|
|
|
|
|
12
|
($eigvec, $eigval) = $var_var->eigens_sym; |
2219
|
|
|
|
|
|
|
# compatibility with PDLA::Slatec::eigsys |
2220
|
3
|
|
|
|
|
2415
|
$eigvec = $eigvec->inplace->transpose->sever; |
2221
|
|
|
|
|
|
|
} |
2222
|
|
|
|
|
|
|
|
2223
|
|
|
|
|
|
|
# ind is sticky point for threading |
2224
|
3
|
|
|
|
|
130
|
my $ind_sorted = $eigval->qsorti->(-1:0); |
2225
|
3
|
|
|
|
|
49
|
$eigvec = $eigvec( ,$ind_sorted)->sever; |
2226
|
3
|
|
|
|
|
129
|
$eigval = $eigval($ind_sorted)->sever; |
2227
|
|
|
|
|
|
|
|
2228
|
|
|
|
|
|
|
# var x axis |
2229
|
3
|
|
|
|
|
103
|
my $var = $eigval / $eigval->sum; |
2230
|
3
|
|
|
|
|
134
|
my $loadings; |
2231
|
3
|
100
|
|
|
|
9
|
if ($opt{CORR}) { |
2232
|
2
|
|
|
|
|
7
|
$loadings = $eigvec * sqrt( $eigval->transpose ); |
2233
|
|
|
|
|
|
|
} |
2234
|
|
|
|
|
|
|
else { |
2235
|
1
|
|
|
|
|
13
|
my $scores = $eigvec x $self->dev_m; |
2236
|
1
|
|
|
|
|
72
|
$loadings = $self->corr( $scores->dummy(1) ); |
2237
|
|
|
|
|
|
|
} |
2238
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
$var->plot_screes(\%opt) |
2240
|
3
|
50
|
|
|
|
232
|
if $opt{PLOT}; |
2241
|
|
|
|
|
|
|
|
2242
|
3
|
|
|
|
|
29
|
return ( eigenvalue=>$eigval, eigenvector=>$eigvec, |
2243
|
|
|
|
|
|
|
pct_var=>$var, loadings=>$loadings ); |
2244
|
|
|
|
|
|
|
} |
2245
|
|
|
|
|
|
|
|
2246
|
|
|
|
|
|
|
=head2 pca_sorti |
2247
|
|
|
|
|
|
|
|
2248
|
|
|
|
|
|
|
Determine by which vars a component is best represented. Descending sort vars by size of association with that component. Returns sorted var and relevant component indices. |
2249
|
|
|
|
|
|
|
|
2250
|
|
|
|
|
|
|
=for options |
2251
|
|
|
|
|
|
|
|
2252
|
|
|
|
|
|
|
Default options (case insensitive): |
2253
|
|
|
|
|
|
|
|
2254
|
|
|
|
|
|
|
NCOMP => 10, # maximum number of components to consider |
2255
|
|
|
|
|
|
|
|
2256
|
|
|
|
|
|
|
=for usage |
2257
|
|
|
|
|
|
|
|
2258
|
|
|
|
|
|
|
Usage: |
2259
|
|
|
|
|
|
|
|
2260
|
|
|
|
|
|
|
# let's see if we replicated the Osgood et al. (1957) study |
2261
|
|
|
|
|
|
|
perldl> ($data, $idv, $ido) = rtable 'osgood_exp.csv', {v=>0} |
2262
|
|
|
|
|
|
|
|
2263
|
|
|
|
|
|
|
# select a subset of var to do pca |
2264
|
|
|
|
|
|
|
perldl> $ind = which_id $idv, [qw( ACTIVE BASS BRIGHT CALM FAST GOOD HAPPY HARD LARGE HEAVY )] |
2265
|
|
|
|
|
|
|
perldl> $data = $data( ,$ind)->sever |
2266
|
|
|
|
|
|
|
perldl> @$idv = @$idv[list $ind] |
2267
|
|
|
|
|
|
|
|
2268
|
|
|
|
|
|
|
perldl> %m = $data->pca |
2269
|
|
|
|
|
|
|
|
2270
|
|
|
|
|
|
|
perldl> ($iv, $ic) = $m{loadings}->pca_sorti() |
2271
|
|
|
|
|
|
|
|
2272
|
|
|
|
|
|
|
perldl> p "$idv->[$_]\t" . $m{loadings}->($_,$ic)->flat . "\n" for (list $iv) |
2273
|
|
|
|
|
|
|
|
2274
|
|
|
|
|
|
|
# COMP0 COMP1 COMP2 COMP3 |
2275
|
|
|
|
|
|
|
HAPPY [0.860191 0.364911 0.174372 -0.10484] |
2276
|
|
|
|
|
|
|
GOOD [0.848694 0.303652 0.198378 -0.115177] |
2277
|
|
|
|
|
|
|
CALM [0.821177 -0.130542 0.396215 -0.125368] |
2278
|
|
|
|
|
|
|
BRIGHT [0.78303 0.232808 -0.0534081 -0.0528796] |
2279
|
|
|
|
|
|
|
HEAVY [-0.623036 0.454826 0.50447 0.073007] |
2280
|
|
|
|
|
|
|
HARD [-0.679179 0.0505568 0.384467 0.165608] |
2281
|
|
|
|
|
|
|
ACTIVE [-0.161098 0.760778 -0.44893 -0.0888592] |
2282
|
|
|
|
|
|
|
FAST [-0.196042 0.71479 -0.471355 0.00460276] |
2283
|
|
|
|
|
|
|
LARGE [-0.241994 0.594644 0.634703 -0.00618055] |
2284
|
|
|
|
|
|
|
BASS [-0.621213 -0.124918 0.0605367 -0.765184] |
2285
|
|
|
|
|
|
|
|
2286
|
|
|
|
|
|
|
=cut |
2287
|
|
|
|
|
|
|
|
2288
|
|
|
|
|
|
|
*pca_sorti = \&PDLA::pca_sorti; |
2289
|
|
|
|
|
|
|
sub PDLA::pca_sorti { |
2290
|
|
|
|
|
|
|
# $self is pdl (var x component) |
2291
|
1
|
|
|
1
|
0
|
7
|
my ($self, $opt) = @_; |
2292
|
|
|
|
|
|
|
|
2293
|
1
|
|
|
|
|
4
|
my %opt = ( |
2294
|
|
|
|
|
|
|
NCOMP => 10, # maximum number of components to consider |
2295
|
|
|
|
|
|
|
); |
2296
|
1
|
|
0
|
|
|
4
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
2297
|
|
|
|
|
|
|
|
2298
|
1
|
|
|
|
|
6
|
my $ncomp = pdl($opt{NCOMP}, $self->dim(1))->min; |
2299
|
1
|
|
|
|
|
62
|
$self = $self->dice_axis( 1, pdl(0..$ncomp-1) ); |
2300
|
|
|
|
|
|
|
|
2301
|
1
|
|
|
|
|
47
|
my $icomp = $self->transpose->abs->maximum_ind; |
2302
|
|
|
|
|
|
|
|
2303
|
|
|
|
|
|
|
# sort between comp |
2304
|
1
|
|
|
|
|
33
|
my $ivar_sort = $icomp->qsorti; |
2305
|
1
|
|
|
|
|
4
|
$self = $self($ivar_sort, )->sever; |
2306
|
|
|
|
|
|
|
|
2307
|
|
|
|
|
|
|
# sort within comp |
2308
|
1
|
|
|
|
|
41
|
my $ic = $icomp($ivar_sort)->iv_cluster; |
2309
|
1
|
|
|
|
|
7
|
for my $comp (0 .. $ic->dim(1)-1) { |
2310
|
3
|
|
|
|
|
369
|
my $i = $self(which($ic( ,$comp)), ($comp))->qsorti->(-1:0); |
2311
|
3
|
|
|
|
|
270
|
($_tmp = $ivar_sort(which $ic( ,$comp))) |
2312
|
|
|
|
|
|
|
.= $ivar_sort(which $ic( ,$comp))->($i)->sever; |
2313
|
|
|
|
|
|
|
} |
2314
|
1
|
50
|
|
|
|
168
|
return wantarray? ($ivar_sort, pdl(0 .. $ic->dim(1)-1)) : $ivar_sort; |
2315
|
|
|
|
|
|
|
} |
2316
|
|
|
|
|
|
|
|
2317
|
|
|
|
|
|
|
=head2 plot_means |
2318
|
|
|
|
|
|
|
|
2319
|
|
|
|
|
|
|
Plots means anova style. Can handle up to 4-way interactions (ie 4D pdl). |
2320
|
|
|
|
|
|
|
|
2321
|
|
|
|
|
|
|
=for options |
2322
|
|
|
|
|
|
|
|
2323
|
|
|
|
|
|
|
Default options (case insensitive): |
2324
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'], |
2326
|
|
|
|
|
|
|
DVNM => 'DV', |
2327
|
|
|
|
|
|
|
AUTO => 1, # auto set dims to be on x-axis, line, panel |
2328
|
|
|
|
|
|
|
# if set 0, dim 0 goes on x-axis, dim 1 as lines |
2329
|
|
|
|
|
|
|
# dim 2+ as panels |
2330
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for next options |
2331
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2332
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2333
|
|
|
|
|
|
|
# set env before passing WIN |
2334
|
|
|
|
|
|
|
DEV => '/xs', # open and close dev for plotting if no WIN |
2335
|
|
|
|
|
|
|
# defaults to '/png' in Windows |
2336
|
|
|
|
|
|
|
SIZE => 640, # individual square panel size in pixels |
2337
|
|
|
|
|
|
|
SYMBL => [0, 4, 7, 11], |
2338
|
|
|
|
|
|
|
|
2339
|
|
|
|
|
|
|
=for usage |
2340
|
|
|
|
|
|
|
|
2341
|
|
|
|
|
|
|
Usage: |
2342
|
|
|
|
|
|
|
|
2343
|
|
|
|
|
|
|
# see anova for mean / se pdl structure |
2344
|
|
|
|
|
|
|
$mean->plot_means( $se, {IVNM=>['apple', 'bake']} ); |
2345
|
|
|
|
|
|
|
|
2346
|
|
|
|
|
|
|
Or like this: |
2347
|
|
|
|
|
|
|
|
2348
|
|
|
|
|
|
|
$m{'# apple ~ bake # m'}->plot_means; |
2349
|
|
|
|
|
|
|
|
2350
|
|
|
|
|
|
|
=cut |
2351
|
|
|
|
|
|
|
|
2352
|
|
|
|
|
|
|
*plot_means = \&PDLA::plot_means; |
2353
|
|
|
|
|
|
|
sub PDLA::plot_means { |
2354
|
0
|
0
|
|
0
|
0
|
|
my $opt = pop @_ |
2355
|
|
|
|
|
|
|
if ref $_[-1] eq 'HASH'; |
2356
|
0
|
|
|
|
|
|
my ($self, $se) = @_; |
2357
|
0
|
0
|
|
|
|
|
if (!$PGPLOT) { |
2358
|
0
|
|
|
|
|
|
carp "No PDLA::Graphics::PGPLOT, no plot :("; |
2359
|
0
|
|
|
|
|
|
return; |
2360
|
|
|
|
|
|
|
} |
2361
|
0
|
|
|
|
|
|
$self = $self->squeeze; |
2362
|
0
|
0
|
|
|
|
|
if ($self->ndims > 4) { |
2363
|
0
|
|
|
|
|
|
carp "Data is > 4D. No plot here."; |
2364
|
0
|
|
|
|
|
|
return; |
2365
|
|
|
|
|
|
|
} |
2366
|
|
|
|
|
|
|
|
2367
|
0
|
|
|
|
|
|
my %opt = ( |
2368
|
|
|
|
|
|
|
IVNM => ['IV_0', 'IV_1', 'IV_2', 'IV_3'], |
2369
|
|
|
|
|
|
|
DVNM => 'DV', |
2370
|
|
|
|
|
|
|
AUTO => 1, # auto set vars to be on X axis, line, panel |
2371
|
|
|
|
|
|
|
WIN => undef, # PDLA::Graphics::PGPLOT::Window object |
2372
|
|
|
|
|
|
|
DEV => $DEV, |
2373
|
|
|
|
|
|
|
SIZE => 640, # individual square panel size in pixels |
2374
|
|
|
|
|
|
|
SYMBL => [0, 4, 7, 11], # ref PDLA::Graphics::PGPLOT::Window |
2375
|
|
|
|
|
|
|
); |
2376
|
0
|
|
0
|
|
|
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
2377
|
|
|
|
|
|
|
|
2378
|
|
|
|
|
|
|
# decide which vars to plot as x axis, lines, panels |
2379
|
|
|
|
|
|
|
# put var w most levels on x axis |
2380
|
|
|
|
|
|
|
# put var w least levels on diff panels |
2381
|
0
|
|
|
|
|
|
my @iD = 0..3; |
2382
|
0
|
|
|
|
|
|
my @dims = (1, 1, 1, 1); |
2383
|
|
|
|
|
|
|
# splice ARRAY,OFFSET,LENGTH,LIST |
2384
|
0
|
|
|
|
|
|
splice @dims, 0, $self->ndims, $self->dims; |
2385
|
0
|
|
|
|
|
|
$self = $self->reshape(@dims)->sever; |
2386
|
0
|
0
|
|
|
|
|
$se = $se->reshape(@dims)->sever |
2387
|
|
|
|
|
|
|
if defined $se; |
2388
|
0
|
|
|
|
|
|
@iD = reverse sort { $a<=>$b } @dims |
2389
|
0
|
0
|
|
|
|
|
if $opt{AUTO}; |
2390
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
# $iD[0] on x axis |
2392
|
|
|
|
|
|
|
# $iD[1] as separate lines |
2393
|
0
|
|
|
|
|
|
my $nx = $self->dim($iD[2]); # n xpanels |
2394
|
0
|
|
|
|
|
|
my $ny = $self->dim($iD[3]); # n ypanels |
2395
|
|
|
|
|
|
|
|
2396
|
0
|
|
|
|
|
|
my $w = $opt{WIN}; |
2397
|
0
|
0
|
|
|
|
|
if (!defined $w) { |
2398
|
|
|
|
|
|
|
$w = pgwin(DEV=>$opt{DEV}, NX=>$nx, NY=>$ny, |
2399
|
0
|
|
|
|
|
|
SIZE=>[$opt{SIZE}*$nx, $opt{SIZE}*$ny], UNIT=>3); |
2400
|
|
|
|
|
|
|
} |
2401
|
|
|
|
|
|
|
|
2402
|
0
|
0
|
|
|
|
|
my ($min, $max) = defined $se? pdl($self + $se, $self - $se)->minmax |
2403
|
|
|
|
|
|
|
: $self->minmax |
2404
|
|
|
|
|
|
|
; |
2405
|
0
|
|
|
|
|
|
my $range = $max - $min; |
2406
|
0
|
|
|
|
|
|
my $p = 0; # panel |
2407
|
|
|
|
|
|
|
|
2408
|
0
|
|
|
|
|
|
for my $y (0..$self->dim($iD[3])-1) { |
2409
|
0
|
|
|
|
|
|
for my $x (0..$self->dim($iD[2])-1) { |
2410
|
0
|
|
|
|
|
|
$p ++; |
2411
|
0
|
|
|
|
|
|
my $tl = ''; |
2412
|
0
|
0
|
|
|
|
|
$tl = $opt{IVNM}->[$iD[2]] . " $x" if $self->dim($iD[2]) > 1; |
2413
|
0
|
0
|
|
|
|
|
$tl .= ' ' . $opt{IVNM}->[$iD[3]] . " $y" if $self->dim($iD[3]) > 1; |
2414
|
|
|
|
|
|
|
$w->env( 0, $self->dim($iD[0])-1, $min - 2*$range/5, $max + $range/5, |
2415
|
|
|
|
|
|
|
{ XTitle=>$opt{IVNM}->[$iD[0]], YTitle=>$opt{DVNM}, Title=>$tl, PANEL=>$p, AXIS=>['BCNT', 'BCNST'], Border=>1, |
2416
|
|
|
|
|
|
|
} ) |
2417
|
0
|
0
|
|
|
|
|
unless $opt{WIN}; |
2418
|
|
|
|
|
|
|
|
2419
|
0
|
|
|
|
|
|
my (@legend, @color); |
2420
|
0
|
|
|
|
|
|
for (0 .. $self->dim($iD[1]) - 1) { |
2421
|
0
|
0
|
|
|
|
|
push @legend, $opt{IVNM}->[$iD[1]] . " $_" |
2422
|
|
|
|
|
|
|
if ($self->dim($iD[1]) > 1); |
2423
|
0
|
|
|
|
|
|
push @color, $_ + 2; # start from red |
2424
|
|
|
|
|
|
|
$w->points( sequence($self->dim($iD[0])), |
2425
|
|
|
|
|
|
|
$self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_), |
2426
|
0
|
|
|
|
|
|
$opt{SYMBL}->[$_], |
2427
|
|
|
|
|
|
|
{ PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2, PLOTLINE=>1, } ); |
2428
|
0
|
0
|
|
|
|
|
$w->errb( sequence($self->dim($iD[0])), |
2429
|
|
|
|
|
|
|
$self->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_), |
2430
|
|
|
|
|
|
|
$se->dice_axis($iD[3],$y)->dice_axis($iD[2],$x)->dice_axis($iD[1],$_), |
2431
|
|
|
|
|
|
|
{ PANEL=>$p, CHARSIZE=>2, COLOR=>$_+2 } ) |
2432
|
|
|
|
|
|
|
if defined $se; |
2433
|
|
|
|
|
|
|
} |
2434
|
0
|
0
|
|
|
|
|
if ($self->dim($iD[1]) > 1) { |
2435
|
0
|
|
|
|
|
|
$w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10, |
2436
|
|
|
|
|
|
|
{ COLOR=>\@color } ); |
2437
|
|
|
|
|
|
|
$w->legend( \@legend, ($self->dim($iD[0])-1)/1.6, $min - $range/10, |
2438
|
0
|
|
|
|
|
|
{ COLOR=>\@color, SYMBOL=>[ @{$opt{SYMBL}}[0..$#color] ] } ); |
|
0
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
} |
2440
|
|
|
|
|
|
|
} |
2441
|
|
|
|
|
|
|
} |
2442
|
|
|
|
|
|
|
$w->close |
2443
|
0
|
0
|
|
|
|
|
unless $opt{WIN}; |
2444
|
|
|
|
|
|
|
|
2445
|
0
|
|
|
|
|
|
return; |
2446
|
|
|
|
|
|
|
} |
2447
|
|
|
|
|
|
|
|
2448
|
|
|
|
|
|
|
=head2 plot_residuals |
2449
|
|
|
|
|
|
|
|
2450
|
|
|
|
|
|
|
Plots residuals against predicted values. |
2451
|
|
|
|
|
|
|
|
2452
|
|
|
|
|
|
|
=for usage |
2453
|
|
|
|
|
|
|
|
2454
|
|
|
|
|
|
|
Usage: |
2455
|
|
|
|
|
|
|
|
2456
|
|
|
|
|
|
|
$y->plot_residuals( $y_pred, { dev=>'/png' } ); |
2457
|
|
|
|
|
|
|
|
2458
|
|
|
|
|
|
|
=for options |
2459
|
|
|
|
|
|
|
|
2460
|
|
|
|
|
|
|
Default options (case insensitive): |
2461
|
|
|
|
|
|
|
|
2462
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for more info |
2463
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2464
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2465
|
|
|
|
|
|
|
# set env before passing WIN |
2466
|
|
|
|
|
|
|
DEV => '/xs', # open and close dev for plotting if no WIN |
2467
|
|
|
|
|
|
|
# defaults to '/png' in Windows |
2468
|
|
|
|
|
|
|
SIZE => 640, # plot size in pixels |
2469
|
|
|
|
|
|
|
COLOR => 1, |
2470
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
=cut |
2472
|
|
|
|
|
|
|
|
2473
|
|
|
|
|
|
|
*plot_residuals = \&PDLA::plot_residuals; |
2474
|
|
|
|
|
|
|
sub PDLA::plot_residuals { |
2475
|
0
|
0
|
|
0
|
0
|
|
if (!$PGPLOT) { |
2476
|
0
|
|
|
|
|
|
carp "No PDLA::Graphics::PGPLOT, no plot :("; |
2477
|
0
|
|
|
|
|
|
return; |
2478
|
|
|
|
|
|
|
} |
2479
|
0
|
0
|
|
|
|
|
my $opt = pop @_ |
2480
|
|
|
|
|
|
|
if ref $_[-1] eq 'HASH'; |
2481
|
0
|
|
|
|
|
|
my ($y, $y_pred) = @_; |
2482
|
0
|
|
|
|
|
|
my %opt = ( |
2483
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for next options |
2484
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2485
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2486
|
|
|
|
|
|
|
# set env before passing WIN |
2487
|
|
|
|
|
|
|
DEV => $DEV , # open and close dev for plotting if no WIN |
2488
|
|
|
|
|
|
|
SIZE => 640, # plot size in pixels |
2489
|
|
|
|
|
|
|
COLOR => 1, |
2490
|
|
|
|
|
|
|
); |
2491
|
0
|
|
0
|
|
|
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
2492
|
|
|
|
|
|
|
|
2493
|
0
|
|
|
|
|
|
my $residuals = $y - $y_pred; |
2494
|
|
|
|
|
|
|
|
2495
|
0
|
|
|
|
|
|
my $win = $opt{WIN}; |
2496
|
|
|
|
|
|
|
|
2497
|
0
|
0
|
|
|
|
|
if (!$win) { |
2498
|
0
|
|
|
|
|
|
$win = pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3); |
2499
|
0
|
|
|
|
|
|
$win->env( $y_pred->minmax, $residuals->minmax, |
2500
|
|
|
|
|
|
|
{XTITLE=>'predicted value', YTITLE=>'residuals', |
2501
|
|
|
|
|
|
|
AXIS=>['BCNT', 'BCNST'], Border=>1,} ); |
2502
|
|
|
|
|
|
|
} |
2503
|
|
|
|
|
|
|
|
2504
|
0
|
|
|
|
|
|
$win->points($y_pred, $residuals, { COLOR=>$opt{COLOR} }); |
2505
|
|
|
|
|
|
|
# add 0-line |
2506
|
0
|
|
|
|
|
|
$win->line(pdl($y_pred->minmax), pdl(0,0), { COLOR=>$opt{COLOR} } ); |
2507
|
|
|
|
|
|
|
|
2508
|
|
|
|
|
|
|
$win->close |
2509
|
0
|
0
|
|
|
|
|
unless $opt{WIN}; |
2510
|
|
|
|
|
|
|
|
2511
|
0
|
|
|
|
|
|
return; |
2512
|
|
|
|
|
|
|
} |
2513
|
|
|
|
|
|
|
|
2514
|
|
|
|
|
|
|
|
2515
|
|
|
|
|
|
|
=head2 plot_scores |
2516
|
|
|
|
|
|
|
|
2517
|
|
|
|
|
|
|
Plots standardized original and PCA transformed scores against two components. (Thank you, Bob MacCallum, for the documentation suggestion that led to this function.) |
2518
|
|
|
|
|
|
|
|
2519
|
|
|
|
|
|
|
=for options |
2520
|
|
|
|
|
|
|
|
2521
|
|
|
|
|
|
|
Default options (case insensitive): |
2522
|
|
|
|
|
|
|
|
2523
|
|
|
|
|
|
|
CORR => 1, # boolean. PCA was based on correlation or covariance |
2524
|
|
|
|
|
|
|
COMP => [0,1], # indices to components to plot |
2525
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for next options |
2526
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2527
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2528
|
|
|
|
|
|
|
# set env before passing WIN |
2529
|
|
|
|
|
|
|
DEV => '/xs', # open and close dev for plotting if no WIN |
2530
|
|
|
|
|
|
|
# defaults to '/png' in Windows |
2531
|
|
|
|
|
|
|
SIZE => 640, # plot size in pixels |
2532
|
|
|
|
|
|
|
COLOR => [1,2], # color for original and rotated scores |
2533
|
|
|
|
|
|
|
|
2534
|
|
|
|
|
|
|
=for usage |
2535
|
|
|
|
|
|
|
|
2536
|
|
|
|
|
|
|
Usage: |
2537
|
|
|
|
|
|
|
|
2538
|
|
|
|
|
|
|
my %p = $data->pca(); |
2539
|
|
|
|
|
|
|
$data->plot_scores( $p{eigenvector}, \%opt ); |
2540
|
|
|
|
|
|
|
|
2541
|
|
|
|
|
|
|
=cut |
2542
|
|
|
|
|
|
|
|
2543
|
|
|
|
|
|
|
*plot_scores = \&PDLA::plot_scores; |
2544
|
|
|
|
|
|
|
sub PDLA::plot_scores { |
2545
|
0
|
0
|
|
0
|
0
|
|
if (!$PGPLOT) { |
2546
|
0
|
|
|
|
|
|
carp "No PDLA::Graphics::PGPLOT, no plot :("; |
2547
|
0
|
|
|
|
|
|
return; |
2548
|
|
|
|
|
|
|
} |
2549
|
0
|
0
|
|
|
|
|
my $opt = pop @_ |
2550
|
|
|
|
|
|
|
if ref $_[-1] eq 'HASH'; |
2551
|
0
|
|
|
|
|
|
my ($self, $eigvec) = @_; |
2552
|
0
|
|
|
|
|
|
my %opt = ( |
2553
|
|
|
|
|
|
|
CORR => 1, # boolean. PCA was based on correlation or covariance |
2554
|
|
|
|
|
|
|
COMP => [0,1], # indices to components to plot |
2555
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for next options |
2556
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2557
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2558
|
|
|
|
|
|
|
# set env before passing WIN |
2559
|
|
|
|
|
|
|
DEV => $DEV , # open and close dev for plotting if no WIN |
2560
|
|
|
|
|
|
|
SIZE => 640, # plot size in pixels |
2561
|
|
|
|
|
|
|
COLOR => [1,2], # color for original and transformed scoress |
2562
|
|
|
|
|
|
|
); |
2563
|
0
|
|
0
|
|
|
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
2564
|
|
|
|
|
|
|
|
2565
|
0
|
|
|
|
|
|
my $i = pdl $opt{COMP}; |
2566
|
0
|
0
|
|
|
|
|
my $z = $opt{CORR}? $self->stddz : $self->dev_m; |
2567
|
|
|
|
|
|
|
|
2568
|
|
|
|
|
|
|
# transformed normed values |
2569
|
0
|
|
|
|
|
|
my $scores = sumover($eigvec( ,$i) * $z->transpose->dummy(1))->transpose; |
2570
|
0
|
|
|
|
|
|
$z = $z( ,$i)->sever; |
2571
|
|
|
|
|
|
|
|
2572
|
0
|
|
|
|
|
|
my $win = $opt{WIN}; |
2573
|
0
|
|
|
|
|
|
my $max = pdl($z, $scores)->abs->ceil->max; |
2574
|
0
|
0
|
|
|
|
|
if (!$win) { |
2575
|
0
|
|
|
|
|
|
$win = pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3); |
2576
|
0
|
|
|
|
|
|
$win->env(-$max, $max, -$max, $max, |
2577
|
|
|
|
|
|
|
{XTitle=>"Compoment $opt{COMP}->[0]", YTitle=>"Component $opt{COMP}->[1]", |
2578
|
|
|
|
|
|
|
AXIS=>['ABCNST', 'ABCNST'], Border=>1, }); |
2579
|
|
|
|
|
|
|
} |
2580
|
|
|
|
|
|
|
|
2581
|
0
|
|
|
|
|
|
$win->points( $z( ,0;-), $z( ,1;-), { COLOR=>$opt{COLOR}->[0] } ); |
2582
|
0
|
|
|
|
|
|
$win->points( $scores( ,0;-), $scores( ,1;-), { COLOR=>$opt{COLOR}->[1] } ); |
2583
|
0
|
|
|
|
|
|
$win->legend( ['original', 'transformed'], .2*$max, .8*$max, {color=>[1,2],symbol=>[1,1]} ); |
2584
|
|
|
|
|
|
|
$win->close |
2585
|
0
|
0
|
|
|
|
|
unless $opt{WIN}; |
2586
|
0
|
|
|
|
|
|
return; |
2587
|
|
|
|
|
|
|
} |
2588
|
|
|
|
|
|
|
|
2589
|
|
|
|
|
|
|
|
2590
|
|
|
|
|
|
|
=head2 plot_screes |
2591
|
|
|
|
|
|
|
|
2592
|
|
|
|
|
|
|
Scree plot. Plots proportion of variance accounted for by PCA components. |
2593
|
|
|
|
|
|
|
|
2594
|
|
|
|
|
|
|
=for options |
2595
|
|
|
|
|
|
|
|
2596
|
|
|
|
|
|
|
Default options (case insensitive): |
2597
|
|
|
|
|
|
|
|
2598
|
|
|
|
|
|
|
NCOMP => 20, # max number of components to plot |
2599
|
|
|
|
|
|
|
CUT => 0, # set to plot cutoff line after this many components |
2600
|
|
|
|
|
|
|
# undef to plot suggested cutoff line for NCOMP comps |
2601
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for next options |
2602
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2603
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2604
|
|
|
|
|
|
|
# set env before passing WIN |
2605
|
|
|
|
|
|
|
DEV => '/xs', # open and close dev for plotting if no WIN |
2606
|
|
|
|
|
|
|
# defaults to '/png' in Windows |
2607
|
|
|
|
|
|
|
SIZE => 640, # plot size in pixels |
2608
|
|
|
|
|
|
|
COLOR => 1, |
2609
|
|
|
|
|
|
|
|
2610
|
|
|
|
|
|
|
=for usage |
2611
|
|
|
|
|
|
|
|
2612
|
|
|
|
|
|
|
Usage: |
2613
|
|
|
|
|
|
|
|
2614
|
|
|
|
|
|
|
# variance should be in descending order |
2615
|
|
|
|
|
|
|
|
2616
|
|
|
|
|
|
|
$pca{var}->plot_screes( {ncomp=>16} ); |
2617
|
|
|
|
|
|
|
|
2618
|
|
|
|
|
|
|
Or, because NCOMP is used so often, it is allowed a shortcut, |
2619
|
|
|
|
|
|
|
|
2620
|
|
|
|
|
|
|
$pca{var}->plot_screes( 16 ); |
2621
|
|
|
|
|
|
|
|
2622
|
|
|
|
|
|
|
=cut |
2623
|
|
|
|
|
|
|
|
2624
|
|
|
|
|
|
|
*plot_scree = \&PDLA::plot_screes; # here for now for compatibility |
2625
|
|
|
|
|
|
|
*plot_screes = \&PDLA::plot_screes; |
2626
|
|
|
|
|
|
|
sub PDLA::plot_screes { |
2627
|
0
|
0
|
|
0
|
0
|
|
if (!$PGPLOT) { |
2628
|
0
|
|
|
|
|
|
carp "No PDLA::Graphics::PGPLOT, no plot :("; |
2629
|
0
|
|
|
|
|
|
return; |
2630
|
|
|
|
|
|
|
} |
2631
|
0
|
0
|
|
|
|
|
my $opt = pop @_ |
2632
|
|
|
|
|
|
|
if ref $_[-1] eq 'HASH'; |
2633
|
0
|
|
|
|
|
|
my ($self, $ncomp) = @_; |
2634
|
0
|
|
|
|
|
|
my %opt = ( |
2635
|
|
|
|
|
|
|
NCOMP => 20, # max number of components to plot |
2636
|
|
|
|
|
|
|
CUT => 0, # set to plot cutoff line after this many components |
2637
|
|
|
|
|
|
|
# undef to plot suggested cutoff line for NCOMP comps |
2638
|
|
|
|
|
|
|
# see PDLA::Graphics::PGPLOT::Window for next options |
2639
|
|
|
|
|
|
|
WIN => undef, # pgwin object. not closed here if passed |
2640
|
|
|
|
|
|
|
# allows comparing multiple lines in same plot |
2641
|
|
|
|
|
|
|
# set env before passing WIN |
2642
|
|
|
|
|
|
|
DEV => $DEV , # open and close dev for plotting if no WIN |
2643
|
|
|
|
|
|
|
SIZE => 640, # plot size in pixels |
2644
|
|
|
|
|
|
|
COLOR => 1, |
2645
|
|
|
|
|
|
|
); |
2646
|
0
|
|
0
|
|
|
|
$opt and $opt{uc $_} = $opt->{$_} for (keys %$opt); |
2647
|
0
|
0
|
|
|
|
|
$opt{NCOMP} = $ncomp |
2648
|
|
|
|
|
|
|
if $ncomp; |
2649
|
|
|
|
|
|
|
# re-use $ncomp below |
2650
|
0
|
0
|
|
|
|
|
$ncomp = ($self->dim(0) < $opt{NCOMP})? $self->dim(0) : $opt{NCOMP}; |
2651
|
|
|
|
|
|
|
$opt{CUT} = PDLA::Stats::Kmeans::_scree_ind $self(0:$ncomp-1) |
2652
|
0
|
0
|
|
|
|
|
if !defined $opt{CUT}; |
2653
|
|
|
|
|
|
|
|
2654
|
0
|
|
|
|
|
|
my $win = $opt{WIN}; |
2655
|
|
|
|
|
|
|
|
2656
|
0
|
0
|
|
|
|
|
if (!$win) { |
2657
|
0
|
|
|
|
|
|
$win = pgwin(DEV=>$opt{DEV}, SIZE=>[$opt{SIZE}, $opt{SIZE}], UNIT=>3); |
2658
|
0
|
|
|
|
|
|
$win->env(0, $ncomp-1, 0, 1, |
2659
|
|
|
|
|
|
|
{XTitle=>'Compoment', YTitle=>'Proportion of Variance Accounted for', |
2660
|
|
|
|
|
|
|
AXIS=>['BCNT', 'BCNST'], Border=>1, }); |
2661
|
|
|
|
|
|
|
} |
2662
|
|
|
|
|
|
|
|
2663
|
|
|
|
|
|
|
$win->points(sequence($ncomp), $self(0:$ncomp-1, ), |
2664
|
0
|
|
|
|
|
|
{CHARSIZE=>2, COLOR=>$opt{COLOR}, PLOTLINE=>1} ); |
2665
|
|
|
|
|
|
|
$win->line( pdl($opt{CUT}-.5, $opt{CUT}-.5), pdl(-.05, $self->max+.05), |
2666
|
|
|
|
|
|
|
{COLOR=>15} ) |
2667
|
0
|
0
|
|
|
|
|
if $opt{CUT}; |
2668
|
|
|
|
|
|
|
$win->close |
2669
|
0
|
0
|
|
|
|
|
unless $opt{WIN}; |
2670
|
0
|
|
|
|
|
|
return; |
2671
|
|
|
|
|
|
|
} |
2672
|
|
|
|
|
|
|
|
2673
|
|
|
|
|
|
|
=head1 SEE ALSO |
2674
|
|
|
|
|
|
|
|
2675
|
|
|
|
|
|
|
PDLA::Fit::Linfit |
2676
|
|
|
|
|
|
|
|
2677
|
|
|
|
|
|
|
PDLA::Fit::LM |
2678
|
|
|
|
|
|
|
|
2679
|
|
|
|
|
|
|
=head1 REFERENCES |
2680
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
Cohen, J., Cohen, P., West, S.G., & Aiken, L.S. (2003). Applied Multiple Regression/correlation Analysis for the Behavioral Sciences (3rd ed.). Mahwah, NJ: Lawrence Erlbaum Associates Publishers. |
2682
|
|
|
|
|
|
|
|
2683
|
|
|
|
|
|
|
Hosmer, D.W., & Lemeshow, S. (2000). Applied Logistic Regression (2nd ed.). New York, NY: Wiley-Interscience. |
2684
|
|
|
|
|
|
|
|
2685
|
|
|
|
|
|
|
Lorch, R.F., & Myers, J.L. (1990). Regression analyses of repeated measures data in cognitive research. Journal of Experimental Psychology: Learning, Memory, & Cognition, 16, 149-157. |
2686
|
|
|
|
|
|
|
|
2687
|
|
|
|
|
|
|
Osgood C.E., Suci, G.J., & Tannenbaum, P.H. (1957). The Measurement of Meaning. Champaign, IL: University of Illinois Press. |
2688
|
|
|
|
|
|
|
|
2689
|
|
|
|
|
|
|
Rutherford, A. (2001). Introducing Anova and Ancova: A GLM Approach (1st ed.). Thousand Oaks, CA: Sage Publications. |
2690
|
|
|
|
|
|
|
|
2691
|
|
|
|
|
|
|
Shlens, J. (2009). A Tutorial on Principal Component Analysis. Retrieved April 10, 2011 from http://citeseerx.ist.psu.edu/ |
2692
|
|
|
|
|
|
|
|
2693
|
|
|
|
|
|
|
The GLM procedure: unbalanced ANOVA for two-way design with interaction. (2008). SAS/STAT(R) 9.2 User's Guide. Retrieved June 18, 2009 from http://support.sas.com/ |
2694
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
Van den Noortgatea, W., & Onghenaa, P. (2006). Analysing repeated measures data in cognitive research: A comment on regression coefficient analyses. European Journal of Cognitive Psychology, 18, 937-952. |
2696
|
|
|
|
|
|
|
|
2697
|
|
|
|
|
|
|
=head1 AUTHOR |
2698
|
|
|
|
|
|
|
|
2699
|
|
|
|
|
|
|
Copyright (C) 2009 Maggie J. Xiong |
2700
|
|
|
|
|
|
|
|
2701
|
|
|
|
|
|
|
All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDLA distribution. |
2702
|
|
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
=cut |
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
|
2706
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
; |
2708
|
|
|
|
|
|
|
|
2709
|
|
|
|
|
|
|
|
2710
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
# Exit with OK status |
2712
|
|
|
|
|
|
|
|
2713
|
|
|
|
|
|
|
1; |
2714
|
|
|
|
|
|
|
|
2715
|
|
|
|
|
|
|
|