File Coverage

blib/lib/PDLA/Stats.pm
Criterion Covered Total %
statement 25 26 96.1
branch 2 4 50.0
condition n/a
subroutine 7 7 100.0
pod n/a
total 34 37 91.8


line stmt bran cond sub pod time code
1             package PDLA::Stats;
2              
3             =head1 NAME
4              
5             PDLA::Stats - a collection of statistics modules in Perl Data Language, with a quick-start guide for non-PDLA people.
6              
7             =cut
8              
9             $VERSION = '0.76';
10              
11             $PDLA::onlinedoc->scan(__FILE__) if $PDLA::onlinedoc;
12              
13             =head1 DESCRIPTION
14              
15             Loads modules named below, making the functions available in the current namespace.
16              
17             Properly formatted documentations online at http://pdl-stats.sf.net
18            
19             =head1 SYNOPSIS
20              
21             use PDLA::LiteF; # loads less modules
22             use PDLA::NiceSlice; # preprocessor for easier pdl indexing syntax
23              
24             use PDLA::Stats;
25            
26             # Is equivalent to the following:
27              
28             use PDLA::Stats::Basic;
29             use PDLA::Stats::GLM;
30             use PDLA::Stats::Kmeans;
31             use PDLA::Stats::TS;
32            
33             # and the following if installed;
34              
35             use PDLA::Stats::Distr;
36             use PDLA::GSL::CDF;
37              
38             =head1 QUICK-START FOR NON-PDLA PEOPLE
39              
40             Enjoy PDLA::Stats without having to dive into PDLA, just wet your feet a little. Three key words two concepts and an icing on the cake, you should be well on your way there.
41              
42             =head2 pdl
43              
44             The magic word that puts PDLA::Stats at your disposal. pdl creates a PDLA numeric data object (a pdl, pronounced "piddle" :/ ) from perl array or array ref. All PDLA::Stats methods, unless meant for regular perl array, can then be called from the data object.
45              
46             my @y = 0..5;
47              
48             my $y = pdl @y;
49              
50             # a simple function
51              
52             my $stdv = $y->stdv;
53              
54             # you can skip the intermediate $y
55              
56             my $stdv = stdv( pdl @y );
57              
58             # a more complex method, skipping intermediate $y
59              
60             my @x1 = qw( y y y n n n );
61             my @x2 = qw( 1 0 1 0 1 0 )
62              
63             # do a two-way analysis of variance with y as DV and x1 x2 as IVs
64              
65             my %result = pdl(@y)->anova( \@x1, \@x2 );
66             print "$_\t$result{$_}\n" for (sort keys %result);
67              
68             If you have a list of list, ie array of array refs, pdl will create a multi-dimensional data object.
69              
70             my @a = ( [1,2,3,4], [0,1,2,3], [4,5,6,7] );
71              
72             my $a = pdl @a;
73              
74             print $a . $a->info;
75              
76             # here's what you will get
77              
78             [
79             [1 2 3 4]
80             [0 1 2 3]
81             [4 5 6 7]
82             ]
83             PDLA: Double D [4,3]
84              
85             PDLA::Stats puts observations in the first dimension and variables in the second dimension, ie pdl [obs, var]. In PDLA::Stats the above example represents 4 observations on 3 variables.
86              
87             # you can do all kinds of fancy stuff on such a 2D pdl.
88              
89             my %result = $a->kmeans( {NCLUS=>2} );
90             print "$_\t$result{$_}\n" for (sort keys %result);
91              
92             Make sure the array of array refs is rectangular. If the array refs are of unequal sizes, pdl will pad it out with 0s to match the longest list.
93              
94             =head2 info
95              
96             Tells you the data type (yes pdls are typed, but you shouldn't have to worry about it here*) and dimensionality of the pdl, as seen in the above example. I find it a big help for my sanity to keep track of the dimensionality of a pdl. As mentioned above, PDLA::Stats uses 2D pdl with observation x variable dimensionality.
97              
98             *pdl uses double precision by default. If you are working with things like epoch time, then you should probably use pdl(long, @epoch) to maintain the precision.
99              
100             =head2 list
101              
102             Come back to the perl reality from the PDLA wonder land. list turns a pdl data object into a regular perl list. Caveat: list produces a flat list. The dimensionality of the data object is lost.
103              
104             =head2 Signature
105              
106             This is not a function, but a concept. You will see something like this frequently in the pod:
107              
108             stdv
109            
110             Signature: (a(n); float+ [o]b())
111              
112             The signature tells you what the function expects as input and what kind of output it produces. a(n) means it expects a 1D pdl with n elements; [o] is for output, b() means its a scalar. So stdv will take your 1D list and give back a scalar. float+ you can ignore; but if you insist, it means the output is at float or double precision. The name a or b or c is not important. What's important is the thing in the parenthesis.
113              
114             corr
115            
116             Signature: (a(n); b(n); float+ [o]c())
117              
118             Here the function corr takes two inputs, two 1D pdl with the same numbers of elements, and gives back a scalar.
119              
120             t_test
121            
122             Signature: (a(n); b(m); float+ [o]t(); [o]d())
123              
124             Here the function t_test can take two 1D pdls of unequal size (n==m is certainly fine), and give back two scalars, t-value and degrees of freedom. Yes we accommodate t-tests with unequal sample sizes.
125              
126             assign
127            
128             Signature: (data(o,v); centroid(c,v); byte [o]cluster(o,c))
129              
130             Here is one of the most complicated signatures in the package. This is a function from Kmeans. assign takes data of observasion x variable dimensions, and a centroid of cluster x variable dimensions, and returns an observation x cluster membership pdl (indicated by 1s and 0s).
131              
132             Got the idea? Then we can see how PDLA does its magic :)
133              
134             =head2 Threading
135              
136             Another concept. The first thing to know is that, threading is optional.
137              
138             PDLA threading means automatically repeating the operation on extra elements or dimensions fed to a function. For a function with a signature like this
139              
140             gsl_cdf_tdist_P
141              
142             Signature: (double x(); double nu(); [o]out())
143              
144             the signatures says that it takes two scalars as input, and returns a scalar as output. If you need to look up the p-values for a list of t's, with the same degrees of freedom 19,
145              
146             my @t = ( 1.65, 1.96, 2.56 );
147              
148             my $p = gsl_cdf_tdist_P( pdl(@t), 19 );
149              
150             print $p . "\n" . $p->info;
151              
152             # here's what you will get
153              
154             [0.94231136 0.96758551 0.99042586]
155             PDLA: Double D [3]
156              
157             The same function is repeated on each element in the list you provided. If you had different degrees of freedoms for the t's,
158              
159             my @df = (199, 39, 19);
160              
161             my $p = gsl_cdf_tdist_P( pdl(@t), pdl(@df) );
162              
163             print $p . "\n" . $p->info;
164              
165             # here's what you will get
166            
167             [0.94973979 0.97141553 0.99042586]
168             PDLA: Double D [3]
169              
170             The df's are automatically matched with the t's to give you the results.
171              
172             An example of threading thru extra dimension(s):
173              
174             stdv
175            
176             Signature: (a(n); float+ [o]b())
177              
178             if the input is of 2D, say you want to compute the stdv for each of the 3 variables,
179              
180             my @a = ( [1,1,3,4], [0,1,2,3], [4,5,6,7] );
181              
182             # pdl @a is pdl dim [4,3]
183              
184             my $sd = stdv( pdl @a );
185              
186             print $sd . "\n" . $sd->info;
187              
188             # this is what you will get
189              
190             [ 1.2990381 1.118034 1.118034]
191             PDLA: Double D [3]
192              
193             Here the function was given an input with an extra dimension of size 3, so it repeates the stdv operation on the extra dimenion 3 times, and gives back a 1D pdl of size 3.
194              
195             Threading works for arbitrary number of dimensions, but it's best to refrain from higher dim pdls unless you have already decided to become a PDLA wiz / witch.
196              
197             Not all PDLA::Stats methods thread. As a rule of thumb, if a function has a signature attached to it, it threads.
198              
199             =head2 perldl
200              
201             Essentially a perl shell with "use PDLA;" at start up. Comes with the PDLA installation. Very handy to try out pdl operations, or just plain perl. print is shortened to p to avoid injury from exessive typing. my goes out of scope at the end of (multi)line input, so mostly you will have to drop the good practice of my here.
202              
203             =head2 For more info
204              
205             PDLA::Impatient
206              
207             =cut
208              
209 1     1   139389 use strict;
  1         6  
  1         30  
210 1     1   4 use warnings;
  1         2  
  1         150  
211              
212              
213             sub PDLA::Stats::import {
214              
215 1     1   10 my $pkg = (caller())[0];
216 1         2 my $use;
217            
218 1 50       2 if (grep {-e $_ . '/PDLA/GSL/CDF.pm'} @INC) {
  11         106  
219 0         0 $use = <
220            
221             package $pkg;
222              
223             use PDLA::Stats::Basic;
224             use PDLA::Stats::Distr;
225             use PDLA::Stats::GLM;
226             use PDLA::Stats::Kmeans;
227             use PDLA::Stats::TS;
228             use PDLA::GSL::CDF;
229              
230             EOD
231             }
232             else {
233 1         3 $use = <
234              
235             package $pkg;
236              
237             use PDLA::Stats::Basic;
238             use PDLA::Stats::GLM;
239             use PDLA::Stats::Kmeans;
240             use PDLA::Stats::TS;
241              
242             EOD
243             }
244            
245 1     1   439 eval $use;
  1     1   4  
  1     1   9  
  1     1   977  
  1         4  
  1         9  
  1         277  
  1         2  
  1         4  
  1         653  
  1         3  
  1         7  
  1         64  
246 1 50       244 die $@ if $@;
247             }
248              
249             =head1 AUTHOR
250              
251             ~~~~~~~~~~~~ ~~~~~ ~~~~~~~~ ~~~~~ ~~~ `` ><(((">
252              
253             Copyright (C) 2009-2015 Maggie J. Xiong
254              
255             All rights reserved. There is no warranty. You are allowed to redistribute this software / documentation as described in the file COPYING in the PDLA distribution.
256              
257             =cut
258              
259             1;