line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Statistics::Normality; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
61274
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
87
|
|
4
|
2
|
|
|
2
|
|
15
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
89
|
|
5
|
2
|
|
|
2
|
|
13
|
use Carp; |
|
2
|
|
|
|
|
10
|
|
|
2
|
|
|
|
|
283
|
|
6
|
2
|
|
|
2
|
|
2502
|
use Statistics::Distributions; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Statistics::Normality - test whether an empirical distribution can be taken as being drawn from a normally-distributed population |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 VERSION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Version 0.01 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=cut |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Statistics::Normality ':all'; |
24
|
|
|
|
|
|
|
use Statistics::Normality 'shapiro_wilk_test'; |
25
|
|
|
|
|
|
|
use Statistics::Normality 'dagostino_k_square_test'; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head1 DESCRIPTION |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Various situations call for testing whether an empirical sample |
30
|
|
|
|
|
|
|
can be presumed to have been drawn from a normally |
31
|
|
|
|
|
|
|
(L) |
32
|
|
|
|
|
|
|
distributed population, especially because many downstream |
33
|
|
|
|
|
|
|
significance tests depend upon the assumption of |
34
|
|
|
|
|
|
|
normality. |
35
|
|
|
|
|
|
|
This package implements some of the more |
36
|
|
|
|
|
|
|
L |
37
|
|
|
|
|
|
|
from the mathematical statistics literature, though there |
38
|
|
|
|
|
|
|
are also others that are not |
39
|
|
|
|
|
|
|
included. |
40
|
|
|
|
|
|
|
The tests here are all so-called I tests that |
41
|
|
|
|
|
|
|
find departures from normality |
42
|
|
|
|
|
|
|
on the basis of skewness and/or kurtosis [Dagostino71]. |
43
|
|
|
|
|
|
|
Note that, although the |
44
|
|
|
|
|
|
|
L |
45
|
|
|
|
|
|
|
can also be used in this capacity, it is a |
46
|
|
|
|
|
|
|
I test and therefore not advisable [Dagostino71]. |
47
|
|
|
|
|
|
|
This, and other distance tests (e.g. Chi-square) are not implemented |
48
|
|
|
|
|
|
|
here. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 TESTS |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
The subtleties and esoterica of various statistical tests for |
53
|
|
|
|
|
|
|
normality require some familiarity with the mathematical statistics |
54
|
|
|
|
|
|
|
literature. |
55
|
|
|
|
|
|
|
We give rules-of-thumb for specific tests, where they exist, but it may be |
56
|
|
|
|
|
|
|
advisable to try several different tests to check the consistency of the |
57
|
|
|
|
|
|
|
conclusion. |
58
|
|
|
|
|
|
|
It is probably also a good idea to check results graphically, either by |
59
|
|
|
|
|
|
|
direct plotting or by a L. |
60
|
|
|
|
|
|
|
In general, small samples will often pass a normality test suggesting |
61
|
|
|
|
|
|
|
the I that there is insufficient information to detect |
62
|
|
|
|
|
|
|
departure from normal for such cases, should it |
63
|
|
|
|
|
|
|
exist. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Each of the methods here is a frequentist test, i.e. one that tests against the |
66
|
|
|
|
|
|
|
L |
67
|
|
|
|
|
|
|
that the sample is |
68
|
|
|
|
|
|
|
normal. |
69
|
|
|
|
|
|
|
In other words, a low p-value recommends rejecting the |
70
|
|
|
|
|
|
|
null. |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head1 EXPORT |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
A list of functions that can be exported. You can delete this section |
75
|
|
|
|
|
|
|
if you don't export anything, such as for a purely object-oriented module. |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=cut |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
use vars qw( @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION ); |
80
|
|
|
|
|
|
|
require Exporter; |
81
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
82
|
|
|
|
|
|
|
@EXPORT = qw(); |
83
|
|
|
|
|
|
|
@EXPORT_OK = qw/shapiro_wilk_test dagostino_k_square_test/; |
84
|
|
|
|
|
|
|
%EXPORT_TAGS = (all => [qw/shapiro_wilk_test dagostino_k_square_test/]); |
85
|
|
|
|
|
|
|
my $pkg = 'Statistics::Normality'; |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# GENERAL IMPLEMENTATION NOTES FOR STATISTICAL TESTS |
89
|
|
|
|
|
|
|
# |
90
|
|
|
|
|
|
|
# (1) Use standard Horner's Rule for polynomial evaluations, see e.g. Forsythe, |
91
|
|
|
|
|
|
|
# Malcolm, and Moler "Computer Methods for Mathematical Computations" |
92
|
|
|
|
|
|
|
# (1977) Prentice-Hall, pp 68. |
93
|
|
|
|
|
|
|
# |
94
|
|
|
|
|
|
|
# (2) VAGARIES OF THE PERL Statistics::Distributions PACKAGE symmetric |
95
|
|
|
|
|
|
|
# This package is implemented in the opposite way that one : | |
96
|
|
|
|
|
|
|
# finds tables of the standard normal function presented in /:\ | |
97
|
|
|
|
|
|
|
# textbooks, where F(z) = A1 is the area from -infinity to : / : \ | |
98
|
|
|
|
|
|
|
# Z (or sometimes from 0 to Z). Instead, the Perl package :/ : \| |
99
|
|
|
|
|
|
|
# gives f(z) = A2 as the area from Z to +infinity, i.e. / : \ |
100
|
|
|
|
|
|
|
# in the *context of a significance test*. Note the /: : |\ |
101
|
|
|
|
|
|
|
# following implications for this package: / : A1 | \ |
102
|
|
|
|
|
|
|
# / : : |A2\ |
103
|
|
|
|
|
|
|
# f(Z) = F(-Z) F(Z) + f(Z) + 1 __/___:___:___|___\___ |
104
|
|
|
|
|
|
|
# -Z 0 Z |
105
|
|
|
|
|
|
|
# -1 -1 -1 |
106
|
|
|
|
|
|
|
# udistr: Z = f [f(Z)] = f [1 - F(Z)] = f (1 - A1) |
107
|
|
|
|
|
|
|
# |
108
|
|
|
|
|
|
|
# and the same appears to be true for other distributions in this package, |
109
|
|
|
|
|
|
|
# e.g. chi-square, student's T, etc. |
110
|
|
|
|
|
|
|
# |
111
|
|
|
|
|
|
|
# (3) Tests that should perhaps be implemented in future versions: |
112
|
|
|
|
|
|
|
# |
113
|
|
|
|
|
|
|
# * Anderson-Darling test |
114
|
|
|
|
|
|
|
# * Jarque-Bera test |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
####################### |
117
|
|
|
|
|
|
|
# SHAPIRO-WILK TEST # |
118
|
|
|
|
|
|
|
####################### |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head2 Shapiro-Wilk Test |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
The L |
123
|
|
|
|
|
|
|
[Shapiro65] is considered to be among the most objective tests of |
124
|
|
|
|
|
|
|
normality [Royston92] and also one of the most powerful ones for detecting |
125
|
|
|
|
|
|
|
non-normality [Chen71]. |
126
|
|
|
|
|
|
|
Its statistic is essentially the roughly best unbiased estimator |
127
|
|
|
|
|
|
|
of population standard deviation to the sample variance [Dagostino71]. |
128
|
|
|
|
|
|
|
The test is mathematically complex and most implementations use several |
129
|
|
|
|
|
|
|
conventional approximations (as we do here), including Blom's formula |
130
|
|
|
|
|
|
|
for the expected value of the order statistics [Harter61] and transformation |
131
|
|
|
|
|
|
|
to standard normal distribution for evaluation, especially for large |
132
|
|
|
|
|
|
|
samples [Royston92]. |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
$pval = shapiro_wilk_test ([0.34, -0.2, 0.8, ...]); |
135
|
|
|
|
|
|
|
($pval, $w_statistic) = shapiro_wilk_test ([0.34, -0.2, 0.8, ...]); |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
This test may not be the best if there are many repeated values in the test |
138
|
|
|
|
|
|
|
distribution or when the number of points in the test distribution is very |
139
|
|
|
|
|
|
|
large, e.g. more than 5000. |
140
|
|
|
|
|
|
|
The routine will L about the latter, but not the |
141
|
|
|
|
|
|
|
former. |
142
|
|
|
|
|
|
|
This particular implementation of the test also requires at |
143
|
|
|
|
|
|
|
least 6 data points in the sample distribution and will L |
144
|
|
|
|
|
|
|
otherwise. |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=cut |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# IMPLEMENTATION NOTES FOR SHAPIRO-WILK TEST |
149
|
|
|
|
|
|
|
# |
150
|
|
|
|
|
|
|
# (1) THIS ROUTINE IS NON-TRIVIAL TO IMPLEMENT --- THE IMPLEMENTATION HERE |
151
|
|
|
|
|
|
|
# ROUGHLY FOLLOWS THE EXPLANATIONS GIVEN IN A PAPER BY GUNER AND |
152
|
|
|
|
|
|
|
# JOHNSON (GJ), EXCEPT WHERE THIS PAPER HAS ERRORS, AS NOTED IN THE |
153
|
|
|
|
|
|
|
# CODE. A DRAFT OF THIS PAPER IS AVAILABLE AT: |
154
|
|
|
|
|
|
|
# |
155
|
|
|
|
|
|
|
# http://esl.eng.ohio-state.edu/~rstheory/iip/shapiro.pdf |
156
|
|
|
|
|
|
|
# |
157
|
|
|
|
|
|
|
# BUT NOTE THAT WE CHANGE THEIR 1->N VECTOR INDEX NOTATION TO 0->N-1 TO BE |
158
|
|
|
|
|
|
|
# CONSISTENT WITH HOW PERL STORES LISTS |
159
|
|
|
|
|
|
|
# |
160
|
|
|
|
|
|
|
# (2) Size limits for empirical distributions to be tested. The log-standard |
161
|
|
|
|
|
|
|
# normal transformation "Z-form" used here has been empirically shown to be |
162
|
|
|
|
|
|
|
# good up to 2000 data points according to GJ, but GJ also claims up to |
163
|
|
|
|
|
|
|
# 5000 is OK. This version that uses the Blom approximation also requires |
164
|
|
|
|
|
|
|
# at least 6 points because the vector of m-values is anti-symmetric and |
165
|
|
|
|
|
|
|
# 6 is the lowest number of members for which the numerator of epsilon |
166
|
|
|
|
|
|
|
# does not vanish: |
167
|
|
|
|
|
|
|
# |
168
|
|
|
|
|
|
|
# num = M*M - 2[m_(n)^2 + m_(n-1)^2] |
169
|
|
|
|
|
|
|
# = [m_(1)^2 + m_(2)^2 + m_(3)^2 +...+ m_(n)^2 - 2[m_(n)^2 + m_(n-1)^2] |
170
|
|
|
|
|
|
|
# |
171
|
|
|
|
|
|
|
# # points notes |
172
|
|
|
|
|
|
|
# 1 no distribution (a point) |
173
|
|
|
|
|
|
|
# 2 no distribution (a line) |
174
|
|
|
|
|
|
|
# 3 m1 & m3 cancelled by 2*m3 and m2=0 |
175
|
|
|
|
|
|
|
# 4 termwise cancellation: m1 & m4 by 2*m4 and m2 & m3 by 2*m3 |
176
|
|
|
|
|
|
|
# 5 similar except m3 identically 0 |
177
|
|
|
|
|
|
|
# |
178
|
|
|
|
|
|
|
# (3) test case from http://www.statsdirect.com/help/parametric_methods/swt.htm |
179
|
|
|
|
|
|
|
# |
180
|
|
|
|
|
|
|
# the list qw/0.0987 0.0000 0.0533 -0.0026 0.0293 -0.0036 0.0246 -0.0042 |
181
|
|
|
|
|
|
|
# 0.0200 -0.0114 0.0194 -0.0139 0.0191 -0.0222 0.0180 -0.0333 |
182
|
|
|
|
|
|
|
# 0.0172 -0.0348 0.0132 -0.0363 0.0102 -0.0363 0.0084 -0.0402 |
183
|
|
|
|
|
|
|
# 0.0077 -0.0583 0.0058 -0.1184 0.0016 -0.1420/ |
184
|
|
|
|
|
|
|
# |
185
|
|
|
|
|
|
|
# should give $w_statistic \approx 0.89218 and $pval \approx 0.00544 |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
sub shapiro_wilk_test { |
188
|
|
|
|
|
|
|
my ($sample_distribution) = @_; |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
#__SOME CONSTANTS |
191
|
|
|
|
|
|
|
my $three_eighths = 3/8; |
192
|
|
|
|
|
|
|
my $one_fourth = 1/4; |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
#__ARGUMENT MUST BE A REFERENCE TO A LIST OF NUMBERS (NOTE REGEXP::COMMON |
195
|
|
|
|
|
|
|
# DOES NOT SEEM TO PROVIDE A GENERAL REGEXP FOR FLOATING POINTS OF VARIOUS |
196
|
|
|
|
|
|
|
# FORMS (AS ITS TITLE WOULD SEEM TO INDICATE) --- FOUND A PRETTY GOOD REGEXP |
197
|
|
|
|
|
|
|
# AT http://perl.active-venture.com/pod/perlretut-regexp.html WHICH SEEMS TO |
198
|
|
|
|
|
|
|
# BE PART OF THE PERL REGULAR EXPRESSIONS TUTORIAL |
199
|
|
|
|
|
|
|
croak "arg must be list ref" unless ref $sample_distribution eq "ARRAY"; |
200
|
|
|
|
|
|
|
foreach my $string (@{$sample_distribution}) { |
201
|
|
|
|
|
|
|
croak "'$string' not a number" unless |
202
|
|
|
|
|
|
|
$string =~ /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
#__SORT AND SIZE |
206
|
|
|
|
|
|
|
@{$sample_distribution} = sort _numerical_ @{$sample_distribution}; |
207
|
|
|
|
|
|
|
my $num_vals = scalar @{$sample_distribution}; |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
#__LIST SIZE HAS CERTAIN LIMITS AS NOTED ABOVE |
210
|
|
|
|
|
|
|
carp "results not guaranteed for >5000 points" if $num_vals > 5000; |
211
|
|
|
|
|
|
|
croak "sample must have at least 6 points" unless $num_vals >= 6; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
#__VECTOR OF M-VALUES (GJ EQN 7) I.E. EXPECTED VALUES OF THE ORDER STATISTICS |
214
|
|
|
|
|
|
|
# OF THE STANDARD NORMAL DISTRIBUTION USING BLOM'S APPROXIMATION -- SEE |
215
|
|
|
|
|
|
|
# [Harter61] PP 153 AND NOTE INDEXING CORRECTIONS AND IMPLEMENTATION NOTES |
216
|
|
|
|
|
|
|
# FOR "UDISTR" |
217
|
|
|
|
|
|
|
# |
218
|
|
|
|
|
|
|
# CERTIFICATION: THIS BLOCK RETURNS VALUES THAT ARE CONSISTENT WITH |
219
|
|
|
|
|
|
|
# [Harter61] TABLE 1 PP 158-165, THOUGH NOT EXACT BECAUSE |
220
|
|
|
|
|
|
|
# OF THE APPROXIMATION, WHERE THEY LIST THE POSITIVE VALUES |
221
|
|
|
|
|
|
|
# CORRESPONDING TO THE I-TH *LARGEST* NORMAL DEVIATE --- NOTE |
222
|
|
|
|
|
|
|
# THAT THIS LOOP IS BASED ON ORDER FROM LEAST TO GREATEST |
223
|
|
|
|
|
|
|
# SO WE CALCULATE THE I-TH *SMALLEST* NORMAL DEVIATES FIRST |
224
|
|
|
|
|
|
|
# |
225
|
|
|
|
|
|
|
# THE RESULT IS ANTI-SYMMETRIC |
226
|
|
|
|
|
|
|
my @mvals = (); |
227
|
|
|
|
|
|
|
my $mean = 0; |
228
|
|
|
|
|
|
|
for (my $i = 0; $i < $num_vals; $i++) { |
229
|
|
|
|
|
|
|
my $index = $i + 1; |
230
|
|
|
|
|
|
|
my $arg_p = ($index - $three_eighths)/($num_vals + $one_fourth); |
231
|
|
|
|
|
|
|
my $mval = Statistics::Distributions::udistr (1 - $arg_p); |
232
|
|
|
|
|
|
|
push @mvals, $mval; |
233
|
|
|
|
|
|
|
$mean += $sample_distribution->[$i]; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
$mean /= $num_vals; |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
#__DOT-PRODUCT OF VECTOR M WITH ITSELF |
238
|
|
|
|
|
|
|
my $m_dot_m = 0; |
239
|
|
|
|
|
|
|
foreach my $mval (@mvals) { |
240
|
|
|
|
|
|
|
$m_dot_m += $mval * $mval; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
#__VECTOR OF C-VALUES (GJ EQN 9) --- NO $INDEX CORRECTION NECESSARY HERE |
244
|
|
|
|
|
|
|
my $sqrt_m_dot_m = sqrt ($m_dot_m); |
245
|
|
|
|
|
|
|
my @cvals = (); |
246
|
|
|
|
|
|
|
for (my $i = 0; $i < $num_vals; $i++) { |
247
|
|
|
|
|
|
|
my $cval = $mvals[$i] / $sqrt_m_dot_m; |
248
|
|
|
|
|
|
|
push @cvals, $cval; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
#__CALCULATE THE LAST 2 MEMBERS OF THE VECTOR OF A-VALUES USING POLYNOMIAL |
252
|
|
|
|
|
|
|
# APPROXIMATION [Royston92] EVALUATED USING HORNER'S RULE (GJ EQNS 4 AND 5) |
253
|
|
|
|
|
|
|
#__NOTE INDEXING CORRECTIONS |
254
|
|
|
|
|
|
|
my $position_n = $num_vals - 1; |
255
|
|
|
|
|
|
|
my $uval = 1 / sqrt($num_vals); |
256
|
|
|
|
|
|
|
my $a_n = ((((-2.706056*$uval+4.434685)*$uval-2.07119)*$uval-0.147981)*$uval+0.221157)*$uval+$cvals[$position_n]; |
257
|
|
|
|
|
|
|
my $a_n_m_1 = ((((-3.582633*$uval+5.682633)*$uval-1.752461)*$uval-0.293762)*$uval+0.042981)*$uval+$cvals[$position_n - 1]; |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
#__EPSILON (GJ EQN 8) --- NOTE INDEXING CORRECTIONS |
260
|
|
|
|
|
|
|
my $epsilon = $m_dot_m - 2 * ( |
261
|
|
|
|
|
|
|
$mvals[$position_n] * $mvals[$position_n] |
262
|
|
|
|
|
|
|
+ |
263
|
|
|
|
|
|
|
$mvals[$position_n - 1] * $mvals[$position_n - 1] |
264
|
|
|
|
|
|
|
); |
265
|
|
|
|
|
|
|
$epsilon /= 1 - 2 * ($a_n * $a_n + $a_n_m_1 * $a_n_m_1); |
266
|
|
|
|
|
|
|
my $sqrt_epsilon = sqrt($epsilon); |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
#__BUILD THE VECTOR OF A-VALUES (GJ EQNS 4-6) --- NOTE INDEXING CORRECTIONS |
269
|
|
|
|
|
|
|
my @avals = (); |
270
|
|
|
|
|
|
|
push @avals, - $a_n; |
271
|
|
|
|
|
|
|
push @avals, - $a_n_m_1; |
272
|
|
|
|
|
|
|
for (my $i = 2; $i <= $num_vals - 3; $i++) { |
273
|
|
|
|
|
|
|
push @avals, $mvals[$i] / $sqrt_epsilon; |
274
|
|
|
|
|
|
|
} |
275
|
|
|
|
|
|
|
push @avals, $a_n_m_1; |
276
|
|
|
|
|
|
|
push @avals, $a_n; |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
#__SHAPIRO-WILK W-STATISTIC --- NOTE GJ EQN 1 FOR THIS ENTITY IS INCORRECT, AS |
279
|
|
|
|
|
|
|
# THE ACTUAL NUMERATOR IS SQUARED AND THE ACTUAL DIFFERENCE IN THE DENOMINATOR |
280
|
|
|
|
|
|
|
# IS BETWEEN THE I-TH ELEMENT OF THE TEST (SAMPLE) DISTRIBUTION AND ITS AVERAGE |
281
|
|
|
|
|
|
|
my ($sw_numerator, $sw_denominator) = (0, 0); |
282
|
|
|
|
|
|
|
for (my $i = 0; $i < $num_vals; $i++) { |
283
|
|
|
|
|
|
|
$sw_numerator += $avals[$i] * $sample_distribution->[$i]; |
284
|
|
|
|
|
|
|
$sw_denominator += ($sample_distribution->[$i] - $mean)**2; |
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
$sw_numerator *= $sw_numerator; |
287
|
|
|
|
|
|
|
my $w_statistic = $sw_numerator / $sw_denominator; |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
#__TRANSFORMATION OF W-STATISTIC INTO STANDARD Z-STATISTIC USING POLYNOMIAL |
290
|
|
|
|
|
|
|
# APPROXIMATION [Royston92] EVAL'D USING HORNER'S RULE (GJ EQNS 10-12) |
291
|
|
|
|
|
|
|
# SO THAT WE CAN TEST USING THE STANDARD NORMAL DISTRIBUTION |
292
|
|
|
|
|
|
|
my $log_n = log ($num_vals); |
293
|
|
|
|
|
|
|
my $mu_z = ((0.0038915*$log_n - 0.083751)*$log_n - 0.31082)*$log_n - 1.5861; |
294
|
|
|
|
|
|
|
my $log_sigma_z = (0.0030302 * $log_n - 0.082676) * $log_n - 0.4803; |
295
|
|
|
|
|
|
|
my $sigma_z = exp ($log_sigma_z); |
296
|
|
|
|
|
|
|
my $z_statistic = (log (1 - $w_statistic) - $mu_z) / $sigma_z; |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
#__PVALUE FROM STANDARD NORMAL DISTRIBUTION -- SEE IMPLEMENTATION NOTES ABOVE: |
299
|
|
|
|
|
|
|
# "UPROB" GIVES SIGNIFICANCE PVALUE (I.E. AREA TO THE RIGHT OF THE STATISTIC) |
300
|
|
|
|
|
|
|
my $pval = Statistics::Distributions::uprob ($z_statistic); |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
#__RETURN RESULTS |
303
|
|
|
|
|
|
|
if (wantarray) { |
304
|
|
|
|
|
|
|
return ($pval, $w_statistic); |
305
|
|
|
|
|
|
|
} else { |
306
|
|
|
|
|
|
|
return $pval; |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
############################## |
311
|
|
|
|
|
|
|
# D'AGOSTINO K-SQUARE TEST # |
312
|
|
|
|
|
|
|
############################## |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=head2 D'Agostino K-Squared Test |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
The L is a good test against non-normality arising from |
317
|
|
|
|
|
|
|
L and/or |
318
|
|
|
|
|
|
|
L [Dagostino90]. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
$pval = dagostino_k_square_test ([0.34, -0.2, ...]); |
321
|
|
|
|
|
|
|
($pval, $ksq_statistic) = dagostino_k_square_test ([0.34, -0.2, ...]); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
The test statistic depends upon both the sample kurtosis and skewness, as |
324
|
|
|
|
|
|
|
well as the moments of these parameters from a normal population, as quantified |
325
|
|
|
|
|
|
|
by Pearson's coefficients [Pearson31]. |
326
|
|
|
|
|
|
|
These are transformed [Dagostino70,Anscombe83] to expressions that sum |
327
|
|
|
|
|
|
|
to the K-squared statistic, which is essentially chi-square-distributed |
328
|
|
|
|
|
|
|
with 2 degrees of |
329
|
|
|
|
|
|
|
freedom [Dagostino90]. |
330
|
|
|
|
|
|
|
The kurtosis transform, and thus the overall test, generally works best |
331
|
|
|
|
|
|
|
when the sample distribution has at least 20 data points [Anscombe83] and the |
332
|
|
|
|
|
|
|
routine will L |
333
|
|
|
|
|
|
|
otherwise. |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=cut |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
# IMPLEMENTATION NOTES FOR D'AGOSTINO K-SQUARED TEST |
338
|
|
|
|
|
|
|
# |
339
|
|
|
|
|
|
|
# (1) WE ROUGHLY FOLLOW THE VARIABLE NOTATION GIVEN BY THE WIKIPEDIA ARTICLE |
340
|
|
|
|
|
|
|
# AT http://en.wikipedia.org/wiki/D'Agostino's_K-squared_test |
341
|
|
|
|
|
|
|
# |
342
|
|
|
|
|
|
|
# (2) test case from [Dagostino90] PP 318 |
343
|
|
|
|
|
|
|
# |
344
|
|
|
|
|
|
|
# the list qw/393 353 334 336 327 300 300 308 283 285 270 270 272 278 278 |
345
|
|
|
|
|
|
|
# 263 264 267 267 267 268 254 254 254 256 256 258 240 243 246 |
346
|
|
|
|
|
|
|
# 247 248 230 230 230 230 231 232 232 232 234 234 236 236 238 |
347
|
|
|
|
|
|
|
# 220 225 225 226 210 211 212 215 216 217 218 200 202 192 198 |
348
|
|
|
|
|
|
|
# 184 167/; |
349
|
|
|
|
|
|
|
# |
350
|
|
|
|
|
|
|
# should give $ksq_statistic \approx 14.752 and $pval \approx 0.00063 |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
sub dagostino_k_square_test { |
353
|
|
|
|
|
|
|
my ($sample_distribution) = @_; |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
#__ARGUMENT MUST BE A REFERENCE TO A LIST OF NUMBERS (NOTE REGEXP::COMMON |
356
|
|
|
|
|
|
|
# DOES NOT SEEM TO PROVIDE A GENERAL REGEXP FOR FLOATING POINTS OF VARIOUS |
357
|
|
|
|
|
|
|
# FORMS (AS ITS TITLE WOULD SEEM TO INDICATE) --- FOUND A PRETTY GOOD REGEXP |
358
|
|
|
|
|
|
|
# AT http://perl.active-venture.com/pod/perlretut-regexp.html WHICH SEEMS TO |
359
|
|
|
|
|
|
|
# BE PART OF THE PERL REGULAR EXPRESSIONS TUTORIAL |
360
|
|
|
|
|
|
|
croak "arg must be list ref" unless ref $sample_distribution eq "ARRAY"; |
361
|
|
|
|
|
|
|
foreach my $string (@{$sample_distribution}) { |
362
|
|
|
|
|
|
|
croak "'$string' not a number" unless |
363
|
|
|
|
|
|
|
$string =~ /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
#__MOMENTS OF SAMPLE DISTRIBUTION |
367
|
|
|
|
|
|
|
my ($n, $mean, $g_1, $g_2) = _sample_mean_skew_kurt_ ($sample_distribution); |
368
|
|
|
|
|
|
|
carp "results not guaranteed for <20 points" if $n < 20; |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
#__SELECTED PEARSON'S COEFFICIENTS [Pearson31] WITH HORNER'S RULE FOR POLYNOMS |
371
|
|
|
|
|
|
|
my $mu_2_g_1 = 6*($n-2) / (($n+1)*($n+3)); |
372
|
|
|
|
|
|
|
my $gamma_2_g_1 = 36*($n-7)*(($n+2)*$n-5) / (($n-2)*($n+5)*($n+7)*($n+9)); |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
my $mu_1_g_2 = - 6 / ($n+1); |
375
|
|
|
|
|
|
|
my $mu_2_g_2 = 24*$n*($n-2)*($n-3) / (($n+1)*($n+1)*($n+3)*($n+5)); |
376
|
|
|
|
|
|
|
my $gamma_1_g_2 = 6*(($n-5)*$n+2) / (($n+7)*($n+9)); |
377
|
|
|
|
|
|
|
$gamma_1_g_2 *= sqrt(6*($n+3)*($n+5)); |
378
|
|
|
|
|
|
|
$gamma_1_g_2 /= sqrt($n*($n-2)*($n-3)); |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
#__TRANSFORMED SKEWNESS PARAMETER [Dagostino70] |
381
|
|
|
|
|
|
|
my $w_squared = sqrt(2*$gamma_2_g_1+4) - 1; |
382
|
|
|
|
|
|
|
my $delta = 1 / sqrt(log(sqrt($w_squared))); |
383
|
|
|
|
|
|
|
my $alpha_squared = 2/($w_squared - 1); |
384
|
|
|
|
|
|
|
my $ratio_squared = $g_1 * $g_1 / ($alpha_squared * $mu_2_g_1); |
385
|
|
|
|
|
|
|
my $z1 = $delta*log(sqrt($ratio_squared) + sqrt($ratio_squared + 1)); |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
#__EQUIVALENT-ALTERNATE DERIVATION OF SKEWNESS TRANSFORM [Dagostino90] |
388
|
|
|
|
|
|
|
# my $y = $g_1 * sqrt(($n + 1)*($n + 3)/(6*($n - 2))); |
389
|
|
|
|
|
|
|
# my $beta_2 = 3*(($n + 27)*$n - 70)*($n + 1)*($n + 3); |
390
|
|
|
|
|
|
|
# $beta_2 /= ($n - 2)*($n + 5)*($n + 7)*($n + 9); |
391
|
|
|
|
|
|
|
# my $w_squared = sqrt(2*$beta_2 - 2) - 1; |
392
|
|
|
|
|
|
|
# my $delta = 1 / sqrt(log(sqrt($w_squared))); |
393
|
|
|
|
|
|
|
# my $alpha = sqrt(2/($w_squared - 1)); |
394
|
|
|
|
|
|
|
# my $z1 = $delta*log($y/$alpha + sqrt(($y/$alpha)*($y/$alpha) + 1)); |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
#__TRANSFORMED KURTOSIS PARAMETER [Anscombe83] |
397
|
|
|
|
|
|
|
my $a = sqrt(1 + 4 / ($gamma_1_g_2 * $gamma_1_g_2)) + 2/$gamma_1_g_2; |
398
|
|
|
|
|
|
|
$a *= 8/$gamma_1_g_2; |
399
|
|
|
|
|
|
|
$a += 6; |
400
|
|
|
|
|
|
|
my $term = 1 - 2/$a; |
401
|
|
|
|
|
|
|
$term /= 1 + ($g_2 - $mu_1_g_2) * sqrt(2/($a-4)) / sqrt($mu_2_g_2); |
402
|
|
|
|
|
|
|
my $z2 = 1 - 2/(9*$a) - $term**(1/3); |
403
|
|
|
|
|
|
|
$z2 *= sqrt(9*$a/2); |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
#__OMNIBUS K-SQUARED STATISTIC WHICH IS DSITRBUTED ROUGHLY CHI-SQUARED |
406
|
|
|
|
|
|
|
# WITH 2 DEGREES OF FREEDOM [Dagostino90] |
407
|
|
|
|
|
|
|
my $k_squared_stat = $z1*$z1 + $z2*$z2; |
408
|
|
|
|
|
|
|
my $pval = Statistics::Distributions::chisqrprob (2, $k_squared_stat); |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
#__RETURN RESULTS |
411
|
|
|
|
|
|
|
if (wantarray) { |
412
|
|
|
|
|
|
|
return ($pval, $k_squared_stat); |
413
|
|
|
|
|
|
|
} else { |
414
|
|
|
|
|
|
|
return $pval; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
sub _numerical_ {$a <=> $b} |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
sub _sample_mean_skew_kurt_ { |
421
|
|
|
|
|
|
|
my ($sample_distribution) = @_; |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
#__MEAN |
424
|
|
|
|
|
|
|
my $mean = 0; |
425
|
|
|
|
|
|
|
my $num_vals = scalar @{$sample_distribution}; |
426
|
|
|
|
|
|
|
foreach my $datum (@{$sample_distribution}) { |
427
|
|
|
|
|
|
|
$mean += $datum; |
428
|
|
|
|
|
|
|
} |
429
|
|
|
|
|
|
|
$mean /= $num_vals; |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
#__SAMPLE SKEWNESS AND KURTOSIS |
432
|
|
|
|
|
|
|
my ($sum_square_diffs, $sum_cube_diffs, $sum_quad_diffs) = (0, 0, 0); |
433
|
|
|
|
|
|
|
foreach my $datum (@{$sample_distribution}) { |
434
|
|
|
|
|
|
|
my $sq_diff = ($datum - $mean) * ($datum - $mean); |
435
|
|
|
|
|
|
|
$sum_square_diffs += $sq_diff; |
436
|
|
|
|
|
|
|
$sum_cube_diffs += $sq_diff * ($datum - $mean); |
437
|
|
|
|
|
|
|
$sum_quad_diffs += $sq_diff * $sq_diff; |
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
my $sum_square_diffs_over_n = $sum_square_diffs / $num_vals; |
440
|
|
|
|
|
|
|
my $g_1 = $sum_cube_diffs / $num_vals; |
441
|
|
|
|
|
|
|
$g_1 /= $sum_square_diffs_over_n**1.5; |
442
|
|
|
|
|
|
|
my $g_2 = $sum_quad_diffs / $num_vals; |
443
|
|
|
|
|
|
|
$g_2 /= $sum_square_diffs_over_n**2; |
444
|
|
|
|
|
|
|
$g_2 -= 3; |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
#__RESULTS: NUMBER OF VALUES, MEAN, SKEWNESS, KURTOSIS |
447
|
|
|
|
|
|
|
return ($num_vals, $mean, $g_1, $g_2); |
448
|
|
|
|
|
|
|
} |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=head1 REFERENCES |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=over |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=item * |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
[Anscombe83] Anscombe, F. J. and Glynn, W. J. (1983) |
457
|
|
|
|
|
|
|
I, |
458
|
|
|
|
|
|
|
Biometrika B<70>(1), 227-234. |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=item * |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
[Chen71] Chen, E. H. (1971) |
463
|
|
|
|
|
|
|
I, |
464
|
|
|
|
|
|
|
Journal of the American Statistical Association B<66>(336), 760-762. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=item * |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
[Dagostino70] D'Agostino, R. B. (1970) |
469
|
|
|
|
|
|
|
I, |
470
|
|
|
|
|
|
|
Biometrika B<57>(3), 679-681. |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=item * |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
[Dagostino71] D'Agostino, R. B. (1971) |
475
|
|
|
|
|
|
|
I, |
476
|
|
|
|
|
|
|
Biometrika B<58>(2), 341-348. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=item * |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
[Dagostino90] D'Agostino, R. B. et al. (1990) |
481
|
|
|
|
|
|
|
I, |
482
|
|
|
|
|
|
|
American Statistician B<44>(4), 316-321. |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=item * |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
[Harter61] Harter, H. L. (1961) |
487
|
|
|
|
|
|
|
I, |
488
|
|
|
|
|
|
|
Biometrika B<48>(1/2), 151-165. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=item * |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
[Pearson31] Pearson, E. S. (1931) |
493
|
|
|
|
|
|
|
I, |
494
|
|
|
|
|
|
|
Biometrika B<22>(3/4), 423-424. |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
=item * |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
[Royston92] Royston, J. P. (1992) |
499
|
|
|
|
|
|
|
I, |
500
|
|
|
|
|
|
|
Statistics and Computing B<2>(3) 117-119. |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
=item * |
503
|
|
|
|
|
|
|
|
504
|
|
|
|
|
|
|
[Shapiro65] Shapiro, S. S. and Wilk, M. B. (1965) |
505
|
|
|
|
|
|
|
I, |
506
|
|
|
|
|
|
|
Biometrika B<52>(3/4), 591-611. |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
=back |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
=head1 AUTHOR |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
Mike Wendl, C<< >> |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
=head1 BUGS |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through |
517
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll |
518
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
519
|
|
|
|
|
|
|
|
520
|
|
|
|
|
|
|
=head1 SUPPORT |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
You can find documentation for this module with the perldoc command. |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
perldoc Statistics::Normality |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
You can also look for information at: |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=over 4 |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
=item * RT: CPAN's request tracker |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
L |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
=item * AnnoCPAN: Annotated CPAN documentation |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
L |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
=item * CPAN Ratings |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
L |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
=item * Search CPAN |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
L |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
=back |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
Copyright (C) 2011 Washington University |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
554
|
|
|
|
|
|
|
it under the terms of the GNU General Public License as published by |
555
|
|
|
|
|
|
|
the Free Software Foundation; either version 2 of the License, or |
556
|
|
|
|
|
|
|
(at your option) any later version. |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, |
559
|
|
|
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
560
|
|
|
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
561
|
|
|
|
|
|
|
GNU General Public License for more details. |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License |
564
|
|
|
|
|
|
|
along with this program; if not, write to the Free Software |
565
|
|
|
|
|
|
|
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
=cut |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
1; # End of Statistics::Normality |