line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package PDLA::Stats; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
PDLA::Stats - a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDLA people. |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=cut |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
$VERSION = '0.76'; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
$PDLA::onlinedoc->scan(__FILE__) if $PDLA::onlinedoc; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 DESCRIPTION |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Loads modules named below, making the functions available in the current namespace. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Properly formatted documentations online at http://pdl-stats.sf.net |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
use PDLA::LiteF; # loads less modules |
22
|
|
|
|
|
|
|
use PDLA::NiceSlice; # preprocessor for easier pdl indexing syntax |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
use PDLA::Stats; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# Is equivalent to the following: |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
use PDLA::Stats::Basic; |
29
|
|
|
|
|
|
|
use PDLA::Stats::GLM; |
30
|
|
|
|
|
|
|
use PDLA::Stats::Kmeans; |
31
|
|
|
|
|
|
|
use PDLA::Stats::TS; |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# and the following if installed; |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
use PDLA::Stats::Distr; |
36
|
|
|
|
|
|
|
use PDLA::GSL::CDF; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 QUICK-START FOR NON-PDLA PEOPLE |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
Enjoy PDLA::Stats without having to dive into PDLA, just wet your feet a little. Three key words two concepts and an icing on the cake, you should be well on your way there. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 pdl |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
The magic word that puts PDLA::Stats at your disposal. pdl creates a PDLA numeric data object (a pdl, pronounced "piddle" :/ ) from perl array or array ref. All PDLA::Stats methods, unless meant for regular perl array, can then be called from the data object. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
my @y = 0..5; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
my $y = pdl @y; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# a simple function |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my $stdv = $y->stdv; |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# you can skip the intermediate $y |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
my $stdv = stdv( pdl @y ); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# a more complex method, skipping intermediate $y |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
my @x1 = qw( y y y n n n ); |
61
|
|
|
|
|
|
|
my @x2 = qw( 1 0 1 0 1 0 ) |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# do a two-way analysis of variance with y as DV and x1 x2 as IVs |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
my %result = pdl(@y)->anova( \@x1, \@x2 ); |
66
|
|
|
|
|
|
|
print "$_\t$result{$_}\n" for (sort keys %result); |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
If you have a list of list, ie array of array refs, pdl will create a multi-dimensional data object. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
my @a = ( [1,2,3,4], [0,1,2,3], [4,5,6,7] ); |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
my $a = pdl @a; |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
print $a . $a->info; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# here's what you will get |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
[ |
79
|
|
|
|
|
|
|
[1 2 3 4] |
80
|
|
|
|
|
|
|
[0 1 2 3] |
81
|
|
|
|
|
|
|
[4 5 6 7] |
82
|
|
|
|
|
|
|
] |
83
|
|
|
|
|
|
|
PDLA: Double D [4,3] |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
PDLA::Stats puts observations in the first dimension and variables in the second dimension, ie pdl [obs, var]. In PDLA::Stats the above example represents 4 observations on 3 variables. |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# you can do all kinds of fancy stuff on such a 2D pdl. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
my %result = $a->kmeans( {NCLUS=>2} ); |
90
|
|
|
|
|
|
|
print "$_\t$result{$_}\n" for (sort keys %result); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Make sure the array of array refs is rectangular. If the array refs are of unequal sizes, pdl will pad it out with 0s to match the longest list. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head2 info |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Tells you the data type (yes pdls are typed, but you shouldn't have to worry about it here*) and dimensionality of the pdl, as seen in the above example. I find it a big help for my sanity to keep track of the dimensionality of a pdl. As mentioned above, PDLA::Stats uses 2D pdl with observation x variable dimensionality. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
*pdl uses double precision by default. If you are working with things like epoch time, then you should probably use pdl(long, @epoch) to maintain the precision. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head2 list |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Come back to the perl reality from the PDLA wonder land. list turns a pdl data object into a regular perl list. Caveat: list produces a flat list. The dimensionality of the data object is lost. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 Signature |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
This is not a function, but a concept. You will see something like this frequently in the pod: |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
stdv |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Signature: (a(n); float+ [o]b()) |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
The signature tells you what the function expects as input and what kind of output it produces. a(n) means it expects a 1D pdl with n elements; [o] is for output, b() means its a scalar. So stdv will take your 1D list and give back a scalar. float+ you can ignore; but if you insist, it means the output is at float or double precision. The name a or b or c is not important. What's important is the thing in the parenthesis. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
corr |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Signature: (a(n); b(n); float+ [o]c()) |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
Here the function corr takes two inputs, two 1D pdl with the same numbers of elements, and gives back a scalar. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
t_test |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Signature: (a(n); b(m); float+ [o]t(); [o]d()) |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Here the function t_test can take two 1D pdls of unequal size (n==m is certainly fine), and give back two scalars, t-value and degrees of freedom. Yes we accommodate t-tests with unequal sample sizes. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
assign |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Signature: (data(o,v); centroid(c,v); byte [o]cluster(o,c)) |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
Here is one of the most complicated signatures in the package. This is a function from Kmeans. assign takes data of observasion x variable dimensions, and a centroid of cluster x variable dimensions, and returns an observation x cluster membership pdl (indicated by 1s and 0s). |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
Got the idea? Then we can see how PDLA does its magic :) |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 Threading |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
Another concept. The first thing to know is that, threading is optional. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
PDLA threading means automatically repeating the operation on extra elements or dimensions fed to a function. For a function with a signature like this |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
gsl_cdf_tdist_P |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Signature: (double x(); double nu(); [o]out()) |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
the signatures says that it takes two scalars as input, and returns a scalar as output. If you need to look up the p-values for a list of t's, with the same degrees of freedom 19, |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
my @t = ( 1.65, 1.96, 2.56 ); |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
my $p = gsl_cdf_tdist_P( pdl(@t), 19 ); |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
print $p . "\n" . $p->info; |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# here's what you will get |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
[0.94231136 0.96758551 0.99042586] |
155
|
|
|
|
|
|
|
PDLA: Double D [3] |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
The same function is repeated on each element in the list you provided. If you had different degrees of freedoms for the t's, |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
my @df = (199, 39, 19); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
my $p = gsl_cdf_tdist_P( pdl(@t), pdl(@df) ); |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
print $p . "\n" . $p->info; |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# here's what you will get |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
[0.94973979 0.97141553 0.99042586] |
168
|
|
|
|
|
|
|
PDLA: Double D [3] |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
The df's are automatically matched with the t's to give you the results. |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
An example of threading thru extra dimension(s): |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
stdv |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
Signature: (a(n); float+ [o]b()) |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
if the input is of 2D, say you want to compute the stdv for each of the 3 variables, |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
my @a = ( [1,1,3,4], [0,1,2,3], [4,5,6,7] ); |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# pdl @a is pdl dim [4,3] |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
my $sd = stdv( pdl @a ); |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
print $sd . "\n" . $sd->info; |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# this is what you will get |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
[ 1.2990381 1.118034 1.118034] |
191
|
|
|
|
|
|
|
PDLA: Double D [3] |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
Here the function was given an input with an extra dimension of size 3, so it repeates the stdv operation on the extra dimenion 3 times, and gives back a 1D pdl of size 3. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Threading works for arbitrary number of dimensions, but it's best to refrain from higher dim pdls unless you have already decided to become a PDLA wiz / witch. |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Not all PDLA::Stats methods thread. As a rule of thumb, if a function has a signature attached to it, it threads. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head2 perldl |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Essentially a perl shell with "use PDLA;" at start up. Comes with the PDLA installation. Very handy to try out pdl operations, or just plain perl. print is shortened to p to avoid injury from exessive typing. my goes out of scope at the end of (multi)line input, so mostly you will have to drop the good practice of my here. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head2 For more info |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
PDLA::Impatient |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=cut |
208
|
|
|
|
|
|
|
|
209
|
1
|
|
|
1
|
|
139389
|
use strict; |
|
1
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
30
|
|
210
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
150
|
|
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub PDLA::Stats::import { |
214
|
|
|
|
|
|
|
|
215
|
1
|
|
|
1
|
|
10
|
my $pkg = (caller())[0]; |
216
|
1
|
|
|
|
|
2
|
my $use; |
217
|
|
|
|
|
|
|
|
218
|
1
|
50
|
|
|
|
2
|
if (grep {-e $_ . '/PDLA/GSL/CDF.pm'} @INC) { |
|
11
|
|
|
|
|
106
|
|
219
|
0
|
|
|
|
|
0
|
$use = <
|
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
package $pkg; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
use PDLA::Stats::Basic; |
224
|
|
|
|
|
|
|
use PDLA::Stats::Distr; |
225
|
|
|
|
|
|
|
use PDLA::Stats::GLM; |
226
|
|
|
|
|
|
|
use PDLA::Stats::Kmeans; |
227
|
|
|
|
|
|
|
use PDLA::Stats::TS; |
228
|
|
|
|
|
|
|
use PDLA::GSL::CDF; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
EOD |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
else { |
233
|
1
|
|
|
|
|
3
|
$use = <
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
package $pkg; |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
use PDLA::Stats::Basic; |
238
|
|
|
|
|
|
|
use PDLA::Stats::GLM; |
239
|
|
|
|
|
|
|
use PDLA::Stats::Kmeans; |
240
|
|
|
|
|
|
|
use PDLA::Stats::TS; |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
EOD |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
1
|
|
|
1
|
|
439
|
eval $use; |
|
1
|
|
|
1
|
|
4
|
|
|
1
|
|
|
1
|
|
9
|
|
|
1
|
|
|
1
|
|
977
|
|
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
277
|
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
4
|
|
|
1
|
|
|
|
|
653
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
7
|
|
|
1
|
|
|
|
|
64
|
|
246
|
1
|
50
|
|
|
|
244
|
die $@ if $@; |
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head1 AUTHOR |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
~~~~~~~~~~~~ ~~~~~ ~~~~~~~~ ~~~~~ ~~~ `` ><((("> |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Copyright (C) 2009-2015 Maggie J. Xiong |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDLA distribution. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=cut |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
1; |