blib/lib/Statistics/Sequences/Runs.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 171 | 189 | 90.4 |
branch | 66 | 78 | 84.6 |
condition | 9 | 16 | 56.2 |
subroutine | 33 | 36 | 91.6 |
pod | 18 | 18 | 100.0 |
total | 297 | 337 | 88.1 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Statistics::Sequences::Runs; | ||||||
2 | 6 | 6 | 312006 | use 5.008008; | |||
6 | 15 | ||||||
3 | 6 | 6 | 19 | use strict; | |||
6 | 6 | ||||||
6 | 115 | ||||||
4 | 6 | 6 | 20 | use warnings FATAL => 'all'; | |||
6 | 16 | ||||||
6 | 214 | ||||||
5 | 6 | 6 | 19 | use Carp qw(carp croak); | |||
6 | 7 | ||||||
6 | 307 | ||||||
6 | 6 | 6 | 20 | use base qw(Statistics::Sequences); | |||
6 | 5 | ||||||
6 | 2612 | ||||||
7 | 6 | 6 | 152904 | use List::AllUtils qw(all mesh sum uniq); | |||
6 | 9 | ||||||
6 | 281 | ||||||
8 | 6 | 6 | 21 | use Number::Misc qw(is_even is_numeric); | |||
6 | 7 | ||||||
6 | 214 | ||||||
9 | 6 | 6 | 2505 | use Statistics::Zed 0.10; | |||
6 | 37208 | ||||||
6 | 158 | ||||||
10 | 6 | 6 | 27 | use String::Numeric qw(is_int); | |||
6 | 8 | ||||||
6 | 10166 | ||||||
11 | $Statistics::Sequences::Runs::VERSION = '0.22'; | ||||||
12 | |||||||
13 | =pod | ||||||
14 | |||||||
15 | =head1 NAME | ||||||
16 | |||||||
17 | Statistics::Sequences::Runs - The Runs Test: Wald-Wolfowitz runs test descriptives, deviation and combinatorics | ||||||
18 | |||||||
19 | =head1 VERSION | ||||||
20 | |||||||
21 | This is documentation for B |
||||||
22 | |||||||
23 | =head1 SYNOPSIS | ||||||
24 | |||||||
25 | use strict; | ||||||
26 | use Statistics::Sequences::Runs 0.22; | ||||||
27 | my $runs = Statistics::Sequences::Runs->new(); | ||||||
28 | |||||||
29 | # Data are a sequence of dichotomous strings: | ||||||
30 | my @data = (qw/1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1/); | ||||||
31 | my $val; | ||||||
32 | |||||||
33 | # - Pre-load data to use for all methods: | ||||||
34 | $runs->load(\@data); | ||||||
35 | $val = $runs->observed(); | ||||||
36 | $val = $runs->expected(); | ||||||
37 | |||||||
38 | # - or give data as "data => $aref" to each method: | ||||||
39 | $val = $runs->observed(data => AREF); | ||||||
40 | |||||||
41 | # - or give frequencies of the 2 "states" in a sequence: | ||||||
42 | $val = $runs->expected(freqs => [11, 9]); # works with other methods except observed() | ||||||
43 | |||||||
44 | # Deviation ratio: | ||||||
45 | $val = $runs->z_value(ccorr => 1); | ||||||
46 | |||||||
47 | # Probability of deviation from expectation: | ||||||
48 | my ($z, $p) = $runs->z_value(ccorr => 1, tails => 1); # dev. ratio with p-value | ||||||
49 | $val = $runs->p_value(tails => 1); # normal dist. p-value itself | ||||||
50 | $val = $runs->p_value(exact => 1, tails => 1); # p-value by combinatorics | ||||||
51 | |||||||
52 | # Keyed list of descriptives etc.: | ||||||
53 | my $href = $runs->stats_hash(values => [qw/observed p_value/], exact => 1); | ||||||
54 | |||||||
55 | # Print descriptives etc. in the same way: | ||||||
56 | $runs->dump( | ||||||
57 | values => [qw/observed expected p_value/], | ||||||
58 | exact => 1, | ||||||
59 | flag => 1, | ||||||
60 | precision_s => 3, | ||||||
61 | precision_p => 7 | ||||||
62 | ); | ||||||
63 | # prints: observed = 11.000, expected = 10.900, p_value = 0.5700167 | ||||||
64 | |||||||
65 | =head1 DESCRIPTION | ||||||
66 | |||||||
67 | The module returns statistical information re Wald-type runs across a sequence of dichotmous events on one or more consecutive trials. For example, given an accuracy-based sequence composed of matches (H) and misses (M) like (H, H, M, H, M, M, M, M, H), there are 5 runs: 3 for Hs, 2 for Ms. This observed number of runs can be compared with the number expected to occur by chance over the number of trials, relative to the expected variance. More runs than expected ("negative serial dependence") can denote irregularity, instability, mixing up of alternatives. Fewer runs than expected ("positive serial dependence") can denote cohesion, insulation, isolation of alternatives. Both can indicate sequential dependency: either negative (an alternation bias), or positive (a repetition bias). | ||||||
68 | |||||||
69 | The distribution of runs is asymptotically normal, and a deviation-based test of extra-chance occurrence when at least one alternative has more than 20 occurrences (Siegal rule), or both event occurrences exceed 10 (Kelly, 1982), is conventionally considered reliable; otherwise, the module provides an "exact test" based on combinatorics. | ||||||
70 | |||||||
71 | For non-dichotomous, continuous or multinomial data, see L |
||||||
72 | |||||||
73 | =head1 SUBROUTINES/METHODS | ||||||
74 | |||||||
75 | =head2 Data-handling | ||||||
76 | |||||||
77 | =head3 new | ||||||
78 | |||||||
79 | $runs = Statistics::Sequences::Runs->new(); | ||||||
80 | |||||||
81 | Returns a new Runs object. Expects/accepts no arguments but the classname. | ||||||
82 | |||||||
83 | =head3 load | ||||||
84 | |||||||
85 | $runs->load(ARRAY); | ||||||
86 | $runs->load(AREF); | ||||||
87 | $runs->load(foodat => AREF); # named whatever | ||||||
88 | |||||||
89 | Loads a sequence anonymously or by name - see L |
||||||
90 | |||||||
91 | Alternatively, skip this action; data don't always have to be loaded to use the stats methods here. The sequence can be provided with each method call, as shown below, or by simply giving the observed counts of runs (apart, of course, for calculating these counts, when a specific sequence is needed). | ||||||
92 | |||||||
93 | =head3 add, access, unload | ||||||
94 | |||||||
95 | See L |
||||||
96 | |||||||
97 | =head2 Descriptives | ||||||
98 | |||||||
99 | =head3 observed | ||||||
100 | |||||||
101 | $v = $runs->observed(); # use the data loaded anonymously | ||||||
102 | $v = $runs->observed(name => 'foodat'); # ... or the name given on loading | ||||||
103 | $v = $runs->observed(data => AREF); # ... or just give the data now | ||||||
104 | |||||||
105 | Returns the total observed number of runs in the loaded or given data. For example, | ||||||
106 | |||||||
107 | $v = $runs->observed(data => [qw/H H H T T H H/]); | ||||||
108 | |||||||
109 | returns 3 (for the runs 'HHH', 'TT' and 'HH'). | ||||||
110 | |||||||
111 | =cut | ||||||
112 | |||||||
113 | sub observed { | ||||||
114 | 58 | 58 | 1 | 5416 | my ( $self, @args ) = @_; | ||
115 | 58 | 100 | 108 | my $args = ref $args[0] ? $args[0] : {@args}; | |||
116 | 58 | 100 | 111 | return $args->{'observed'} if defined $args->{'observed'}; | |||
117 | 49 | 95 | my $data = _get_data( $self, $args ); | ||||
118 | 49 | 1074 | my $observed = 0; | ||||
119 | 49 | 39 | for ( 0 .. scalar @{$data} - 1 ) { | ||||
49 | 89 | ||||||
120 | 1250 | 100 | 100 | 2974 | if ( $_ == 0 or $data->[$_] ne $data->[ $_ - 1 ] ) { | ||
121 | 550 | 400 | $observed++; | ||||
122 | } | ||||||
123 | } | ||||||
124 | 49 | 96 | return $observed; | ||||
125 | } | ||||||
126 | |||||||
127 | =head3 observed_per_state | ||||||
128 | |||||||
129 | @freq = $runs->observed_per_state(data => AREF); | ||||||
130 | $href = $runs->observed_per_state(data => AREF); | ||||||
131 | |||||||
132 | Returns the number of runs per state - as a two-dimensional array where the first element gives the count for the first state in the data, and so for the second. A hashref is returned if not called in list context, the frequencies keyed by state. For example: | ||||||
133 | |||||||
134 | @ari = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns (2, 1) | ||||||
135 | $ref = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns { H => 2, T => 1} | ||||||
136 | |||||||
137 | Exceptions: If there was only one state in the loaded/given sequence (e.g., data => [qw/H H H/]), there is only one run and so the returned array will be one-dimensional, i.e., (1), and the returned hashref has only a single key (for this example: { H => 1 }). If there are no states, with an empty array loaded/given for the sequence, then the same applies, except the returned array is (0) and the returned hashref has the empty string as its single key ( q{} => 0 ). | ||||||
138 | |||||||
139 | =cut | ||||||
140 | |||||||
141 | sub observed_per_state { | ||||||
142 | 6 | 6 | 1 | 2123 | my ( $self, @args ) = @_; | ||
143 | 6 | 50 | 17 | my $args = ref $args[0] ? $args[0] : {@args}; | |||
144 | 6 | 10 | my $data = _get_data( $self, $args ); | ||||
145 | 6 | 143 | my @states = uniq @{$data}; | ||||
6 | 31 | ||||||
146 | 6 | 9 | my @freqs = (); | ||||
147 | 6 | 100 | 5 | if ( scalar @{$data} ) { | |||
6 | 37 | ||||||
148 | 5 | 100 | 10 | if ( scalar @states > 1 ) { | |||
149 | 3 | 50 | 8 | @freqs = $data->[0] eq $states[0] ? ( 1, 0 ) : ( 0, 1 ); | |||
150 | } | ||||||
151 | else { | ||||||
152 | 2 | 3 | @freqs = (1); | ||||
153 | } | ||||||
154 | 5 | 4 | for ( 1 .. scalar @{$data} - 1 ) { | ||||
5 | 12 | ||||||
155 | 64 | 100 | 84 | if ( $data->[$_] ne $data->[ $_ - 1 ] ) { | |||
156 | 11 | 100 | 13 | if ( $data->[$_] eq $states[0] ) { | |||
157 | 5 | 5 | $freqs[0]++; | ||||
158 | } | ||||||
159 | else { | ||||||
160 | 6 | 6 | $freqs[1]++; | ||||
161 | } | ||||||
162 | } | ||||||
163 | } | ||||||
164 | } | ||||||
165 | else { | ||||||
166 | 1 | 2 | @states = (q{}); | ||||
167 | 1 | 2 | @freqs = (0); | ||||
168 | } | ||||||
169 | 6 | 100 | 32 | return wantarray ? @freqs : { mesh @states, @freqs }; | |||
170 | } | ||||||
171 | |||||||
172 | =head3 expected | ||||||
173 | |||||||
174 | $v = $runs->expected(); # or specify loaded data by "name", or give as "data" | ||||||
175 | $v = $runs->expected(data => AREF); # use these data | ||||||
176 | $v = $runs->expected(freqs => [POS_INT, POS_INT]); # no actual data; calculate from these two Ns | ||||||
177 | |||||||
178 | Returns the expected number of runs across the loaded data. Expectation is given as follows: | ||||||
179 | |||||||
180 | =for html E[R] = ( (2n1n2) / (n1 + n2) ) + 1 |
||||||
181 | |||||||
182 | where I |
||||||
183 | |||||||
184 | =cut | ||||||
185 | |||||||
186 | sub expected { | ||||||
187 | 36 | 36 | 1 | 1950 | my ( $self, @args ) = @_; | ||
188 | 36 | 74 | my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args ); | ||||
189 | 36 | 28 | my $val; | ||||
190 | 36 | 100 | 53 | if ($sum) { | |||
191 | 33 | 55 | $val = ( ( 2 * $n1 * $n2 ) / $sum ) + 1; | ||||
192 | } | ||||||
193 | 36 | 68 | return $val; | ||||
194 | } | ||||||
195 | |||||||
196 | =head3 variance | ||||||
197 | |||||||
198 | $v = $runs->variance(); # use data already loaded - anonymously; or specify its "name" | ||||||
199 | $v = $runs->variance(data => AREF); # use these data | ||||||
200 | $v = $runs->variance(freqs => [POS_INT, POS_INT]); # use these counts - not any particular sequence of data | ||||||
201 | |||||||
202 | Returns the variance in the number of runs for the given data. | ||||||
203 | |||||||
204 | =for html V[R] = ( (2n1n2)([2n1n2] – [n1 + n2]) ) / ( ((n1 + n2)2)((n1 + n2) – 1) ) |
||||||
205 | |||||||
206 | defined as above for L |
||||||
207 | |||||||
208 | The data to test can already have been L |
||||||
209 | |||||||
210 | =cut | ||||||
211 | |||||||
212 | sub variance { | ||||||
213 | 35 | 35 | 1 | 2316 | my ( $self, @args ) = @_; | ||
214 | 35 | 50 | my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args ); | ||||
215 | 35 | 30 | my $val; | ||||
216 | 35 | 100 | 45 | if ($sum) { | |||
217 | 32 | 100 | 48 | if ( $sum < 2 ) { | |||
218 | 4 | 4 | $val = 0; | ||||
219 | } | ||||||
220 | else { | ||||||
221 | 28 | 78 | $val = | ||||
222 | ( ( 2 * $n1 * $n2 * ( ( 2 * $n1 * $n2 ) - $sum ) ) / | ||||||
223 | ( ( $sum**2 ) * ( $sum - 1 ) ) ); | ||||||
224 | } | ||||||
225 | } | ||||||
226 | 35 | 127 | return $val; | ||||
227 | } | ||||||
228 | |||||||
229 | =head3 observed_deviation | ||||||
230 | |||||||
231 | $v = $runs->obsdev(); # use data already loaded - anonymously; or specify its "name" | ||||||
232 | $v = $runs->obsdev(data => AREF); # use these data | ||||||
233 | |||||||
234 | Returns the deviation of (difference between) observed and expected runs for the loaded/given sequence (I |
||||||
235 | |||||||
236 | I |
||||||
237 | |||||||
238 | =cut | ||||||
239 | |||||||
240 | sub observed_deviation { | ||||||
241 | 2 | 2 | 1 | 545 | my ( $self, @args ) = @_; | ||
242 | 2 | 10 | return $self->observed(@args) - $self->expected(@args); | ||||
243 | } | ||||||
244 | *obsdev = \&observed_deviation; | ||||||
245 | |||||||
246 | =head3 standard_deviation | ||||||
247 | |||||||
248 | $v = $runs->stdev(); # use data already loaded - anonymously; or specify its "name" | ||||||
249 | $v = $runs->stdev(data => AREF); | ||||||
250 | $v = $runs->stdev(freqs => [POS_INT, POS_INT]); # don't use actual data; calculate from these two Ns | ||||||
251 | |||||||
252 | Returns square-root of the variance. | ||||||
253 | |||||||
254 | I |
||||||
255 | |||||||
256 | =cut | ||||||
257 | |||||||
258 | sub standard_deviation { | ||||||
259 | 2 | 2 | 1 | 547 | my ( $self, @args ) = @_; | ||
260 | 2 | 5 | return sqrt $self->variance(@args); | ||||
261 | } | ||||||
262 | *stdev = \&standard_deviation; | ||||||
263 | *stddev = \&standard_deviation; | ||||||
264 | |||||||
265 | =head3 skewness | ||||||
266 | |||||||
267 | $v = $runs->skewness(); # use data already loaded - anonymously; or specify its "name" | ||||||
268 | $v = $runs->skewness(data => AREF); # use these data | ||||||
269 | |||||||
270 | Returns run skewness as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence. | ||||||
271 | |||||||
272 | =cut | ||||||
273 | |||||||
274 | sub skewness { | ||||||
275 | 0 | 0 | 1 | 0 | my ( $self, @args ) = @_; | ||
276 | 0 | 0 | my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args ); | ||||
277 | 0 | 0 | my $k3 = 0; | ||||
278 | 0 | 0 | 0 | 0 | if ( $sum && $n1 != $n2 ) { | ||
279 | 0 | 0 | $k3 = | ||||
280 | ( ( 2 * $n1 * $n2 ) / $sum**3 ) * | ||||||
281 | ( ( ( 16 * $n1**2 * $n2**2 ) / $sum**2 ) - | ||||||
282 | ( ( 4 * $n1 * $n2 * ( $sum + 3 ) ) / $sum ) + | ||||||
283 | 3 * $sum ); | ||||||
284 | } | ||||||
285 | 0 | 0 | return $k3; | ||||
286 | } | ||||||
287 | |||||||
288 | =head3 kurtosis | ||||||
289 | |||||||
290 | $v = $runs->kurtosis(); # use data already loaded - anonymously; or specify its "name" | ||||||
291 | $v = $runs->kurtosis(data => AREF); # use these data | ||||||
292 | |||||||
293 | Returns run kurtosis as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence. | ||||||
294 | |||||||
295 | =cut | ||||||
296 | |||||||
297 | sub kurtosis { | ||||||
298 | 0 | 0 | 1 | 0 | my ( $self, @args ) = @_; | ||
299 | 0 | 0 | my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args ); | ||||
300 | 0 | 0 | my $k4; | ||||
301 | 0 | 0 | 0 | if ( defined $sum ) { | |||
302 | 0 | 0 | $k4 = ( ( 2 * $n1 * $n2 ) / $sum**4 ) * ( | ||||
303 | ( | ||||||
304 | ( 48 * ( 5 * $sum - 6 ) * $n1**3 * $n2**3 ) / | ||||||
305 | ( $sum**2 * $sum**2 ) | ||||||
306 | ) - ( | ||||||
307 | ( 48 * ( 2 * $sum**2 + 3 * $sum - 6 ) * $n1**2 * $n2**2 ) / | ||||||
308 | ( $sum**2 * $sum ) | ||||||
309 | ) + ( | ||||||
310 | ( | ||||||
311 | 2 * ( 4 * $sum**3 + 45 * $sum**2 - 37 * $sum - 18 ) * | ||||||
312 | $n1 * $n2 | ||||||
313 | ) / $sum**2 | ||||||
314 | ) - ( 7 * $sum**2 + 13 * $sum - 6 ) | ||||||
315 | ); | ||||||
316 | } | ||||||
317 | 0 | 0 | return $k4; | ||||
318 | } | ||||||
319 | |||||||
320 | =head2 Distribution and tests | ||||||
321 | |||||||
322 | =head3 pmf | ||||||
323 | |||||||
324 | $p = $runs->pmf(data => AREF); # or no args to use last pre-loaded data | ||||||
325 | $p = $runs->pmf(observed => POS_INT, freqs => [POS_INT, POS_INT]); | ||||||
326 | |||||||
327 | Implements the runs probability mass function, returning the probability for a particular number of runs given so many dichotomous events (e.g., as in Swed & Eisenhart, 1943, p. 66); i.e., for I' the observed number of runs, I {I = I'}. The required function parameters are the observed number of runs, and the frequencies (counts) of each state in the sequence, which can be given directly, as above, in the arguments B |
||||||
328 | |||||||
329 | =cut | ||||||
330 | |||||||
331 | sub pmf { | ||||||
332 | 2 | 2 | 1 | 360 | my ( $self, @args ) = @_; | ||
333 | 2 | 6 | my ( $n1, $n2 ) = $self->bi_frequency(@args); | ||||
334 | 2 | 4 | return _pmf_num( $self->observed(@args), $n1, $n2 ) / | ||||
335 | _pmf_denom( $n1, $n2 ); | ||||||
336 | } | ||||||
337 | |||||||
338 | =head3 cdf | ||||||
339 | |||||||
340 | $p = $runs->cdf(data => AREF); # or no args to use last pre-loaded data | ||||||
341 | $p = $runs->cdf(observed => POS_INT, freqs => [POS_INT, POS_INT]); | ||||||
342 | |||||||
343 | Implements the cumulative distribution function for runs, returning the probability of obtaining the observed number of runs or less down to the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; i.e., for I' the observed number of runs, I {I <= I'}. The summation is over the probability mass function L |
||||||
344 | |||||||
345 | =cut | ||||||
346 | |||||||
347 | sub cdf { | ||||||
348 | 5 | 5 | 1 | 545 | my ( $self, @args ) = @_; | ||
349 | 5 | 9 | my ( $n1, $n2 ) = $self->bi_frequency(@args); | ||||
350 | 5 | 8 | my $u = $self->observed(@args); | ||||
351 | 5 | 6 | my $sum = 0; | ||||
352 | 5 | 7 | for ( 2 .. $u ) { | ||||
353 | 39 | 41 | $sum += _pmf_num( $_, $n1, $n2 ); | ||||
354 | } | ||||||
355 | 5 | 7 | return $sum / _pmf_denom( $n1, $n2 ); | ||||
356 | } | ||||||
357 | |||||||
358 | =head3 cdfi | ||||||
359 | |||||||
360 | $p = $runs->cdfi(data => AREF); # or no args for last pre-loaded data | ||||||
361 | $p = $runs->cdfi(observed => POS_INT, freqs => [POS_INT, POS_INT]); | ||||||
362 | |||||||
363 | Implements the (inverse) cumulative distribution function for runs, returning the probability of obtaining more than the observed number of runs up from the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; ; i.e., for I' the observed number of runs, I = 1 - I {I <= I' - 1}. The summation is over the probability mass function L |
||||||
364 | |||||||
365 | =cut | ||||||
366 | |||||||
367 | sub cdfi { | ||||||
368 | 8 | 8 | 1 | 383 | my ( $self, @args ) = @_; | ||
369 | 8 | 13 | my ( $n1, $n2 ) = $self->bi_frequency(@args); | ||||
370 | 8 | 15 | my $u = $self->observed(@args); | ||||
371 | 8 | 7 | my $sum = 0; | ||||
372 | 8 | 14 | for ( 2 .. $u - 1 ) { | ||||
373 | 147 | 138 | $sum += _pmf_num( $_, $n1, $n2 ); | ||||
374 | } | ||||||
375 | 8 | 27 | return 1 - $sum / _pmf_denom( $n1, $n2 ); | ||||
376 | } | ||||||
377 | |||||||
378 | =head3 z_value | ||||||
379 | |||||||
380 | $v = $runs->z_value(ccorr => BOOL); # use data already loaded - anonymously; or specify its "name" | ||||||
381 | $v = $runs->z_value(data => AREF, ccorr => BOOL); | ||||||
382 | ($zvalue, $pvalue) = $runs->z_value(data => AREF, ccorr => BOOL, tails => 1|2); # wanting an array, get p-value too | ||||||
383 | |||||||
384 | Returns the normal deviate from a test of runcount deviation, taking the runcount expected from that observed and dividing by the root variance, by default with a continuity correction to expectation. Called wanting an array, returns the I -value for the B |
||||||
385 | |||||||
386 | The data to test can already have been L |
||||||
387 | |||||||
388 | Other options are B |
||||||
389 | |||||||
390 | I |
||||||
391 | |||||||
392 | =cut | ||||||
393 | |||||||
394 | sub z_value { | ||||||
395 | 8 | 8 | 1 | 1895 | my ( $self, @args ) = @_; | ||
396 | 8 | 100 | 22 | my $args = ref $args[0] ? $args[0] : {@args}; | |||
397 | 8 | 17 | my $observed = $self->observed($args); | ||||
398 | 8 | 100 | 17 | return q{} if $observed == 0; | |||
399 | 7 | 52 | my $zed = Statistics::Zed->new(); | ||||
400 | return $zed->z_value( | ||||||
401 | observed => $observed, | ||||||
402 | expected => $self->expected($args), | ||||||
403 | variance => $self->variance($args), | ||||||
404 | ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ), | ||||||
405 | tails => ( $args->{'tails'} || 2 ), | ||||||
406 | precision_s => $args->{'precision_s'}, | ||||||
407 | 7 | 100 | 50 | 128 | precision_p => $args->{'precision_p'}, | ||
408 | ); | ||||||
409 | } | ||||||
410 | *zvalue = \&z_value; | ||||||
411 | *zscore = \&z_value; | ||||||
412 | |||||||
413 | =head3 p_value | ||||||
414 | |||||||
415 | $p = $runs->p_value(); # using loaded data and default args | ||||||
416 | $p = $runs->p_value(ccorr => BOOL, tails => 1|2); # normal-approx. for last-loaded data | ||||||
417 | $p = $runs->p_value(exact => BOOL); # calc combinatorially for observed >= or < than expectation | ||||||
418 | $p = $runs->p_value(data => AREF, exact => BOOL); # given data | ||||||
419 | $p = $runs->p_value(observed => POS_INT, freqs => [POS_INT, POS_INT]); # no data sequence, specify known params | ||||||
420 | |||||||
421 | Returns the probability of getting the observed number of runs or a smaller number given the number of each of the two events. By default, a large sample is assumed, and the probability is obtained from the normalized deviation, as given by the L |
||||||
422 | |||||||
423 | If the option B |
||||||
424 | |||||||
425 | If there is only one state/event in the sequence, then the variance from the expected value of 1 is 0, and this method returns 1 (however long this single event sequence is, the observed number of runs cannot differ from the expected number of runs). If the sequence is empty, an empty string is returned. | ||||||
426 | |||||||
427 | Output from these tests has been checked against the tables and examples in Swed & Eisenhart (given to 7 decimal places), and found to agree. | ||||||
428 | |||||||
429 | The option B -value to so many decimal places. |
||||||
430 | |||||||
431 | I |
||||||
432 | |||||||
433 | =cut | ||||||
434 | |||||||
435 | sub p_value { | ||||||
436 | 19 | 19 | 1 | 4859 | my ( $self, @args ) = @_; | ||
437 | 19 | 100 | 59 | my $args = ref $args[0] ? $args[0] : {@args}; | |||
438 | 19 | 34 | my $vals = { | ||||
439 | observed => $self->observed($args), | ||||||
440 | expected => $self->expected($args), | ||||||
441 | variance => $self->variance($args), | ||||||
442 | }; | ||||||
443 | 19 | 100 | 48 | return $args->{'exact'} | |||
444 | ? _p_exact( $self, $args, $vals ) | ||||||
445 | : _p_norm( $self, $args, $vals ); | ||||||
446 | } | ||||||
447 | *pvalue = \&p_value; | ||||||
448 | |||||||
449 | =head3 ztest_ok | ||||||
450 | |||||||
451 | $bool = $runs->ztest_ok(); # use data already loaded - anonymously; or specify its "name" | ||||||
452 | $bool = $runs->ztest_ok(data => AREF); | ||||||
453 | |||||||
454 | Returns true for the loaded sequence if its constituent sample numbers are sufficient for their expected runs to be normally approximated - using Siegal's (1956, p. 140) rule - ok if I |
||||||
455 | |||||||
456 | =cut | ||||||
457 | |||||||
458 | sub ztest_ok { | ||||||
459 | 5 | 5 | 1 | 1658 | my ( $self, @args ) = @_; | ||
460 | 5 | 10 | my ( $n1, $n2 ) = $self->bi_frequency(@args); | ||||
461 | 5 | 100 | 100 | 26 | my $retval = | ||
462 | $n1 > 20 || $n2 > 20 | ||||||
463 | ? 1 | ||||||
464 | : 0; | ||||||
465 | 5 | 9 | return $retval; | ||||
466 | } | ||||||
467 | |||||||
468 | =head2 Utils | ||||||
469 | |||||||
470 | Methods used internally, or for returning/printing descriptives, etc., in a bunch. | ||||||
471 | |||||||
472 | =head3 bi_frequency | ||||||
473 | |||||||
474 | @freq = $runs->bi_frequency(data => AREF); # or no args if using last pre-loaded data | ||||||
475 | |||||||
476 | Returns frequency of the two elements - or croaks if there are more than 2, and gives zero for any absent. | ||||||
477 | |||||||
478 | =cut | ||||||
479 | |||||||
480 | sub bi_frequency { | ||||||
481 | 99 | 99 | 1 | 961 | my ( $self, @args ) = @_; | ||
482 | 99 | 100 | 172 | my $args = ref $args[0] ? $args[0] : {@args}; | |||
483 | |||||||
484 | # might be called internally from a method where an array of frequencies per state is optionally already given: | ||||||
485 | carp | ||||||
486 | 'Argument named \'trials\' is deprecated; use \'freqs\' to give aref of frequencies per state' | ||||||
487 | 99 | 50 | 153 | if $args->{'trials'}; | |||
488 | 99 | 100 | 150 | return @{ $args->{'freqs'} } if ref $args->{'freqs'}; | |||
17 | 34 | ||||||
489 | 82 | 94 | my $data = _get_data( $self, $args ); | ||||
490 | |||||||
491 | # build hash keying each element with its frequency: | ||||||
492 | 82 | 1508 | my %states = (); | ||||
493 | 82 | 65 | for ( @{$data} ) { | ||||
82 | 92 | ||||||
494 | 1983 | 1395 | $states{$_}++; | ||||
495 | } | ||||||
496 | |||||||
497 | # Check that number of states in the sequences are computable for Runs, | ||||||
498 | # and ensure that there is at least zero frequency for the 2 states: | ||||||
499 | 82 | 69 | my $nstates = scalar keys %states; | ||||
500 | 82 | 116 | my @vals = values %states; | ||||
501 | |||||||
502 | 82 | 100 | 182 | if ( !$nstates ) { | |||
100 | |||||||
50 | |||||||
503 | 7 | 12 | @vals = ( 0, 0 ); | ||||
504 | } | ||||||
505 | elsif ( $nstates == 1 ) { | ||||||
506 | 27 | 21 | push @vals, 0; | ||||
507 | } | ||||||
508 | elsif ( $nstates > 2 ) { | ||||||
509 | 0 | 0 | croak | ||||
510 | 'Cannot compute runs: More than two states were found in the data'; | ||||||
511 | } | ||||||
512 | 82 | 155 | return @vals; | ||||
513 | } | ||||||
514 | |||||||
515 | =head3 n_max_seq | ||||||
516 | |||||||
517 | $n = $runs->n_max_seq(); # loaded data | ||||||
518 | $n = $runs->n_max_seq(data => AREF); # this sequence | ||||||
519 | $n = $runs->n_max_seq(observed => POS_INT, freqs => [POS_INT, POS_INT]); # these counts | ||||||
520 | |||||||
521 | Returns the number of possible sequences for the two given state frequencies. So the urn contains I |
||||||
522 | |||||||
523 | =for html Nmax = ( N1 + N2 )! / N1!N2! |
||||||
524 | |||||||
525 | This is the denominator term in the runs L |
||||||
526 | |||||||
527 | =cut | ||||||
528 | |||||||
529 | sub n_max_seq { | ||||||
530 | 1 | 1 | 1 | 172 | my ( $self, @args ) = @_; | ||
531 | 1 | 3 | return _pmf_denom( $self->bi_frequency(@args) ); | ||||
532 | } | ||||||
533 | |||||||
534 | =head3 m_seq_k | ||||||
535 | |||||||
536 | $n = $runs->m_seq_k(); # loaded data | ||||||
537 | $n = $runs->m_seq_k(data => AREF); # this sequence | ||||||
538 | $n = $runs->m_seq_k(observed => POS_INT, freqs => [POS_INT, POS_INT]); # these counts | ||||||
539 | |||||||
540 | Returns the number of sequences that can produce I |
||||||
541 | |||||||
542 | =cut | ||||||
543 | |||||||
544 | sub m_seq_k { | ||||||
545 | 1 | 1 | 1 | 338 | my ( $self, @args ) = @_; | ||
546 | 1 | 3 | return _pmf_num( $self->observed(@args), $self->bi_frequency(@args) ); | ||||
547 | } | ||||||
548 | |||||||
549 | =head3 stats_hash | ||||||
550 | |||||||
551 | $href = $runs->stats_hash(values => [qw/observed expected z_value/], precision_s => POS_INT, ccorr => BOOL); # among other values/options | ||||||
552 | $href = $runs->stats_hash(values => | ||||||
553 | { | ||||||
554 | observed => BOOL, | ||||||
555 | expected => BOOL, | ||||||
556 | variance => BOOL, | ||||||
557 | z_value => BOOL, | ||||||
558 | p_value => BOOL, | ||||||
559 | }, | ||||||
560 | exact => BOOL, # for p_value | ||||||
561 | ccorr => BOOL # for z_value | ||||||
562 | ); | ||||||
563 | |||||||
564 | Returns a hashref for the counts and stats as specified in its "values" argument, and with any options for calculating them (e.g., exact for p_value). See L |
||||||
565 | |||||||
566 | =head3 dump | ||||||
567 | |||||||
568 | $runs->dump(values => [qw/observed expected z_value/], precision_s => POS_INT, ccorr => BOOL); # among other values/options | ||||||
569 | $runs->dump(values => | ||||||
570 | { | ||||||
571 | observed => BOOL, | ||||||
572 | expected => BOOL, | ||||||
573 | variance => BOOL, | ||||||
574 | z_value => BOOL, | ||||||
575 | p_value => BOOL, | ||||||
576 | }, | ||||||
577 | precision_s => POS_INT, | ||||||
578 | precision_p => POS_INT, # for p_value | ||||||
579 | flag => BOOL, # for p_value | ||||||
580 | exact => BOOL, # for p_value | ||||||
581 | ccorr => BOOL # for z_value | ||||||
582 | ); | ||||||
583 | |||||||
584 | Print Runs-test results to STDOUT, including the stats as given a true value by their method names in a referenced hash of B |
||||||
585 | |||||||
586 | =cut | ||||||
587 | |||||||
588 | sub dump { | ||||||
589 | 0 | 0 | 1 | 0 | my ( $self, @args ) = @_; | ||
590 | 0 | 0 | 0 | my $args = ref $args[0] ? $args[0] : {@args}; | |||
591 | 0 | 0 | $args->{'stat'} = 'runs'; | ||||
592 | 0 | 0 | $self->SUPER::dump($args); | ||||
593 | 0 | 0 | return; | ||||
594 | } | ||||||
595 | |||||||
596 | =head3 dump_data | ||||||
597 | |||||||
598 | $runs->dump_data(delim => "\n"); # print whatevers loaded (or specify by name, or as "data") | ||||||
599 | |||||||
600 | See L |
||||||
601 | |||||||
602 | =cut | ||||||
603 | |||||||
604 | # Private methods: | ||||||
605 | |||||||
606 | sub _p_exact { | ||||||
607 | 9 | 9 | 9 | my ( $self, $args, $vals ) = @_; | |||
608 | 9 | 7 | my $pval; | ||||
609 | 9 | 100 | 18 | 26 | if ( all { is_numeric($_) } ( $vals->{'observed'}, $vals->{'expected'} ) ) { | ||
18 | 106 | ||||||
610 | $pval = | ||||||
611 | 8 | 100 | 111 | ( $vals->{'observed'} - $vals->{'expected'} >= 0 ) | |||
612 | ? $self->cdfi($args) | ||||||
613 | : $self->cdf($args); | ||||||
614 | 8 | 100 | 20 | if ( $args->{'precision_p'} ) { | |||
615 | 2 | 27 | $pval = sprintf q{%.} . $args->{'precision_p'} . qw{f}, $pval; | ||||
616 | } | ||||||
617 | } | ||||||
618 | else { | ||||||
619 | 1 | 5 | $pval = q{}; | ||||
620 | } | ||||||
621 | 9 | 36 | return $pval; | ||||
622 | } | ||||||
623 | |||||||
624 | sub _p_norm { | ||||||
625 | 10 | 10 | 10 | my ( $self, $args, $vals ) = @_; | |||
626 | 10 | 10 | my $pval; | ||||
627 | 10 | 100 | 43 | if ( !$vals->{'expected'} ) { | |||
100 | |||||||
628 | 1 | 2 | $pval = q{}; | ||||
629 | } | ||||||
630 | elsif ( !$vals->{'variance'} ) { | ||||||
631 | 2 | 3 | $pval = 1; | ||||
632 | } | ||||||
633 | else { | ||||||
634 | 7 | 25 | my $zed = Statistics::Zed->new(); | ||||
635 | $pval = $zed->p_value( | ||||||
636 | 7 | 41 | %{$vals}, | ||||
637 | ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ), | ||||||
638 | tails => ( $args->{'tails'} || 2 ), | ||||||
639 | 7 | 100 | 50 | 100 | precision_p => $args->{'precision_p'}, | ||
640 | ); | ||||||
641 | } | ||||||
642 | 10 | 744 | return $pval; | ||||
643 | } | ||||||
644 | |||||||
645 | sub _pmf_num { | ||||||
646 | 191 | 191 | 141 | my ( $u, $m, $n ) = @_; | |||
647 | 191 | 115 | my $f; | ||||
648 | 191 | 100 | 220 | if ( is_even($u) ) { | |||
649 | 99 | 1255 | my $k = $u / 2 - 1; | ||||
650 | 99 | 109 | $f = 2 * _choose( $m - 1, $k ) * _choose( $n - 1, $k ); | ||||
651 | } | ||||||
652 | else { | ||||||
653 | 92 | 1111 | my $k = ( $u + 1 ) / 2; | ||||
654 | 92 | 103 | $f = | ||||
655 | _choose( $m - 1, $k - 1 ) * _choose( $n - 1, $k - 2 ) + | ||||||
656 | _choose( $m - 1, $k - 2 ) * _choose( $n - 1, $k - 1 ); | ||||||
657 | } | ||||||
658 | 191 | 230 | return $f; | ||||
659 | } | ||||||
660 | |||||||
661 | sub _pmf_denom { | ||||||
662 | 17 | 17 | 199 | my @args = @_; | |||
663 | 17 | 49 | return _choose( sum(@args), $args[0] ); | ||||
664 | } | ||||||
665 | |||||||
666 | sub _choose { # from Orwant et al., p. 573 | ||||||
667 | 583 | 583 | 378 | my ( $n, $k ) = @_; | |||
668 | 583 | 334 | my ( $res, $j ) = ( 1, 1 ); | ||||
669 | 583 | 50 | 33 | 1506 | return 0 if $k > $n || $k < 0; | ||
670 | 583 | 100 | 637 | $k = ( $n - $k ) if ( $n - $k ) < $k; | |||
671 | 583 | 682 | while ( $j <= $k ) { | ||||
672 | 4309 | 2534 | $res *= $n--; | ||||
673 | 4309 | 4752 | $res /= $j++; | ||||
674 | } | ||||||
675 | 583 | 701 | return $res; | ||||
676 | } | ||||||
677 | |||||||
678 | sub _sum_bi_frequency { | ||||||
679 | 71 | 71 | 57 | my ( $self, @args ) = @_; | |||
680 | 71 | 136 | my ( $n1, $n2 ) = $self->bi_frequency(@args); | ||||
681 | 71 | 51 | my $sum; | ||||
682 | 71 | 50 | 142 | 196 | if ( all { is_int($_) } ( $n1, $n2 ) ) { | ||
142 | 597 | ||||||
683 | 71 | 344 | $sum = $n1 + $n2; | ||||
684 | } | ||||||
685 | 71 | 157 | return ( $sum, $n1, $n2 ); | ||||
686 | } | ||||||
687 | |||||||
688 | sub _get_data { | ||||||
689 | 137 | 137 | 113 | my ( $self, $args ) = @_; | |||
690 | 137 | 100 | 181 | return ref $args->{'data'} ? $args->{'data'} : $self->get_aref( %{$args} ); | |||
119 | 329 | ||||||
691 | } | ||||||
692 | |||||||
693 | 1; | ||||||
694 | |||||||
695 | __END__ |