blib/lib/Statistics/Sequences/Runs.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 16 | 18 | 88.8 |
branch | n/a | ||
condition | n/a | ||
subroutine | 6 | 6 | 100.0 |
pod | n/a | ||
total | 22 | 24 | 91.6 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package Statistics::Sequences::Runs; | ||||||
2 | 1 | 1 | 17398 | use 5.008008; | |||
1 | 3 | ||||||
1 | 31 | ||||||
3 | 1 | 1 | 4 | use strict; | |||
1 | 1 | ||||||
1 | 30 | ||||||
4 | 1 | 1 | 3 | use warnings FATAL => 'all'; | |||
1 | 6 | ||||||
1 | 44 | ||||||
5 | 1 | 1 | 4 | use Carp qw(carp croak); | |||
1 | 1 | ||||||
1 | 70 | ||||||
6 | 1 | 1 | 4 | use base qw(Statistics::Sequences); | |||
1 | 1 | ||||||
1 | 367 | ||||||
7 | 1 | 1 | 277 | use List::AllUtils qw(mesh sum uniq); | |||
0 | |||||||
0 | |||||||
8 | use Number::Misc qw(is_even); | ||||||
9 | use Statistics::Zed 0.08; | ||||||
10 | $Statistics::Sequences::Runs::VERSION = '0.21'; | ||||||
11 | |||||||
12 | =pod | ||||||
13 | |||||||
14 | =head1 NAME | ||||||
15 | |||||||
16 | Statistics::Sequences::Runs - descriptives, deviation and combinatorial tests of Wald-Wolfowitz runs | ||||||
17 | |||||||
18 | =head1 VERSION | ||||||
19 | |||||||
20 | This is documentation for Version 0.21 of Statistics::Sequences::Runs. | ||||||
21 | |||||||
22 | =head1 SYNOPSIS | ||||||
23 | |||||||
24 | use strict; | ||||||
25 | use Statistics::Sequences::Runs 0.21; # not compatible with versions < .10 | ||||||
26 | my $runs = Statistics::Sequences::Runs->new(); | ||||||
27 | |||||||
28 | # Data make up a dichotomous sequence: | ||||||
29 | my @data = (qw/1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1/); | ||||||
30 | my $val; | ||||||
31 | |||||||
32 | # - Pre-load data to use for all methods: | ||||||
33 | $runs->load(\@data); | ||||||
34 | $val = $runs->observed(); | ||||||
35 | $val = $runs->expected(); | ||||||
36 | |||||||
37 | # - or send data as "data => $aref" to each method: | ||||||
38 | $val = $runs->observed(data => \@data); | ||||||
39 | |||||||
40 | # - or send frequencies of each of the 2 elements: | ||||||
41 | $val = $runs->expected(freqs => [11, 9]); # works with other methods except observed() | ||||||
42 | |||||||
43 | # Deviation ratio: | ||||||
44 | $val = $runs->z_value(ccorr => 1); | ||||||
45 | |||||||
46 | # Probability: | ||||||
47 | my ($z, $p) = $runs->z_value(ccorr => 1, tails => 1); # dev. ratio with p-value | ||||||
48 | $val = $runs->p_value(tails => 1); # normal dist. p-value itself | ||||||
49 | $val = $runs->p_value(exact => 1, tails => 1); # by combinatorics | ||||||
50 | |||||||
51 | # Keyed list of descriptives etc.: | ||||||
52 | my $href = $runs->stats_hash(values => {observed => 1, p_value => 1}, exact => 1); | ||||||
53 | |||||||
54 | # Print descriptives etc. in the same way: | ||||||
55 | $runs->dump( | ||||||
56 | values => {observed => 1, expected => 1, p_value => 1}, | ||||||
57 | exact => 1, | ||||||
58 | flag => 1, | ||||||
59 | precision_s => 3, | ||||||
60 | precision_p => 7 | ||||||
61 | ); | ||||||
62 | # prints: observed = 11.000, expected = 10.900, p_value = 0.5700167 | ||||||
63 | |||||||
64 | =head1 DESCRIPTION | ||||||
65 | |||||||
66 | The module returns statistical information re Wald-type runs: a sequence of events on 1 or more consecutive trials. For example, in a signal-detection test composed of match (H) and miss (M) events over time like H-H-M-H-M-M-M-M-H, there are 5 runs: 3 Hs, 2 Ms. This number of runs between two events can be compared with the number expected to occur by chance over the number of trials, relative to the expected variance (see REFERENCES). More runs than expected ("negative serial dependence") can denote irregularity, instability, mixing up of alternatives. Fewer runs than expected ("positive serial dependence") can denote cohesion, insulation, isolation of alternatives. Both can indicate sequential dependency: either negative (a bias to produce too many alternations), or positive (a bias to produce too many repetitions). | ||||||
67 | |||||||
68 | The distribution of runs is asymptotically normal, and a deviation-based test of extra-chance occurrence when at least one alternative has more than 20 occurrences (Siegal rule), or both event occurrences exceed 10 (Kelly, 1982), is conventionally considered reliable; otherwise, the module provides an "exact test" option, based on combinatorics. | ||||||
69 | |||||||
70 | Have non-dichotomous, continuous or multinomial data? See L |
||||||
71 | |||||||
72 | =head1 SUBROUTINES/METHODS | ||||||
73 | |||||||
74 | =head2 Data-handling | ||||||
75 | |||||||
76 | =head3 new | ||||||
77 | |||||||
78 | $runs = Statistics::Sequences::Runs->new(); | ||||||
79 | |||||||
80 | Returns a new Runs object. Expects/accepts no arguments but the classname. | ||||||
81 | |||||||
82 | =head3 load | ||||||
83 | |||||||
84 | $runs->load(@data); | ||||||
85 | $runs->load(\@data); | ||||||
86 | $runs->load(foodat => \@data); # labelled whatever | ||||||
87 | |||||||
88 | Loads data anonymously or by name - see L |
||||||
89 | |||||||
90 | After the load, the data are L |
||||||
91 | |||||||
92 | Alternatively, skip this action; data don't always have to be loaded to use the stats methods here. To get the observed number of runs, data of course have to be loaded, but other stats can be got if given the observed count - otherwise, they too depend on data having been loaded. | ||||||
93 | |||||||
94 | Every load unloads all previous loads and any additions to them. | ||||||
95 | |||||||
96 | =head3 add, access, unload | ||||||
97 | |||||||
98 | See L |
||||||
99 | |||||||
100 | =head2 Descriptives | ||||||
101 | |||||||
102 | =head3 observed | ||||||
103 | |||||||
104 | $v = $runs->observed(); # use the first data loaded anonymously | ||||||
105 | $v = $runs->observed(index => 1); # ... or give the required "index" for the loaded data | ||||||
106 | $v = $runs->observed(label => 'foodat'); # ... or its "label" value | ||||||
107 | $v = $runs->observed(data => [1, 0, 1, 1]); # ... or just give the data now | ||||||
108 | |||||||
109 | Returns the total observed number of runs in the loaded or given data. For example, | ||||||
110 | |||||||
111 | $v = $runs->observed_per_state(data => [qw/H H H T T H H/]); | ||||||
112 | |||||||
113 | returns 3. | ||||||
114 | |||||||
115 | I |
||||||
116 | |||||||
117 | =cut | ||||||
118 | |||||||
119 | sub observed { | ||||||
120 | my ( $self, @args ) = @_; | ||||||
121 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
122 | return $args->{'observed'} if defined $args->{'observed'}; | ||||||
123 | my $data = _get_data( $self, $args ); | ||||||
124 | my $rco = 1; | ||||||
125 | for ( 1 .. scalar @{$data} - 1 ) { | ||||||
126 | $rco++ if $data->[$_] ne $data->[ $_ - 1 ]; | ||||||
127 | } | ||||||
128 | return $rco; | ||||||
129 | } | ||||||
130 | *runcount_observed = \&observed; | ||||||
131 | *rco = \&observed; | ||||||
132 | |||||||
133 | =head3 observed_per_state | ||||||
134 | |||||||
135 | @freq = $runs->observed_per_state(data => \@data); | ||||||
136 | $href = $runs->observed_per_state(data => \@data); | ||||||
137 | |||||||
138 | Returns the number of runs per state - as an array where the first element gives the count for the first state in the data, and so for the second. A keyed hashref is returned if not called in array context. For example: | ||||||
139 | |||||||
140 | @ari = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns (2, 1) | ||||||
141 | $ref = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns { H => 2, T => 1} | ||||||
142 | |||||||
143 | =cut | ||||||
144 | |||||||
145 | sub observed_per_state { | ||||||
146 | my ( $self, @args ) = @_; | ||||||
147 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
148 | my $data = _get_data( $self, $args ); | ||||||
149 | my @states = uniq @{$data}; | ||||||
150 | my @freqs = $data->[0] eq $states[0] ? ( 1, 0 ) : ( 0, 1 ); | ||||||
151 | for ( 1 .. scalar @{$data} - 1 ) { | ||||||
152 | if ( $data->[$_] ne $data->[ $_ - 1 ] ) { | ||||||
153 | if ( $data->[$_] eq $states[0] ) { | ||||||
154 | $freqs[0]++; | ||||||
155 | } | ||||||
156 | else { | ||||||
157 | $freqs[1]++; | ||||||
158 | } | ||||||
159 | } | ||||||
160 | } | ||||||
161 | return wantarray ? @freqs : { mesh @states, @freqs }; | ||||||
162 | } | ||||||
163 | |||||||
164 | =head3 expected | ||||||
165 | |||||||
166 | $v = $runs->expected(); # or specify loaded data by "index" or "label", or give it as "data" - see observed() | ||||||
167 | $v = $runs->expected(data => [qw/blah bing blah blah blah/]); # use these data | ||||||
168 | $v = $runs->expected(freqs => [12, 7]); # don't use actual data; calculate from these two Ns | ||||||
169 | |||||||
170 | Returns the expected number of runs across the loaded data. Expectation is given as follows: | ||||||
171 | |||||||
172 | =for html E[R] = ( (2n1n2) / (n1 + n2) ) + 1 |
||||||
173 | |||||||
174 | where I |
||||||
175 | |||||||
176 | I |
||||||
177 | |||||||
178 | =cut | ||||||
179 | |||||||
180 | sub expected { | ||||||
181 | my ( $self, @args ) = @_; | ||||||
182 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
183 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
184 | my $sum = $n1 + $n2; | ||||||
185 | return $sum ? ( ( 2 * $n1 * $n2 ) / $sum ) + 1 : undef; | ||||||
186 | } | ||||||
187 | *rce = \&expected; | ||||||
188 | *runcount_expected = \&expected; | ||||||
189 | |||||||
190 | =head3 variance | ||||||
191 | |||||||
192 | $v = $runs->variance(); # use data already loaded - anonymously; or specify its "label" or "index" - see observed() | ||||||
193 | $v = $runs->variance(data => [qw/blah bing blah blah blah/]); # use these data | ||||||
194 | $v = $runs->variance(freqs => [5, 12]); # use these trial numbers - not any particular sequence of data | ||||||
195 | |||||||
196 | Returns the variance in the number of runs for the given data. | ||||||
197 | |||||||
198 | =for html V[R] = ( (2n1n2)([2n1n2] – [n1 + n2]) ) / ( ((n1 + n2)2)((n1 + n2) – 1) ) |
||||||
199 | |||||||
200 | defined as above for L |
||||||
201 | |||||||
202 | The data to test can already have been L |
||||||
203 | |||||||
204 | I |
||||||
205 | |||||||
206 | =cut | ||||||
207 | |||||||
208 | sub variance { | ||||||
209 | my ( $self, @args ) = @_; | ||||||
210 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
211 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
212 | my $sum = $n1 + $n2; | ||||||
213 | return $sum < 2 | ||||||
214 | ? 1 | ||||||
215 | : ( ( 2 * $n1 * $n2 * ( ( 2 * $n1 * $n2 ) - $sum ) ) / | ||||||
216 | ( ( $sum**2 ) * ( $sum - 1 ) ) ); | ||||||
217 | } | ||||||
218 | *rcv = \&variance; | ||||||
219 | *runcount_variance = \&variance; | ||||||
220 | |||||||
221 | =head3 observed_deviation | ||||||
222 | |||||||
223 | $v = $runs->obsdev(); # use data already loaded - anonymously; or specify its "label" or "index" - see observed() | ||||||
224 | $v = $runs->obsdev(data => [qw/blah bing blah blah blah/]); # use these data | ||||||
225 | |||||||
226 | Returns the deviation of (difference between) observed and expected runs for the loaded/given sequence (I |
||||||
227 | |||||||
228 | I |
||||||
229 | |||||||
230 | =cut | ||||||
231 | |||||||
232 | sub observed_deviation { | ||||||
233 | return observed(@_) - expected(@_); | ||||||
234 | } | ||||||
235 | *obsdev = \&observed_deviation; | ||||||
236 | |||||||
237 | =head3 standard_deviation | ||||||
238 | |||||||
239 | $v = $runs->stdev(); # use data already loaded - anonymously; or specify its "label" or "index" - see observed() | ||||||
240 | $v = $runs->stdev(data => [qw/blah bing blah blah blah/]); | ||||||
241 | |||||||
242 | Returns square-root of the variance. | ||||||
243 | |||||||
244 | I |
||||||
245 | |||||||
246 | =cut | ||||||
247 | |||||||
248 | sub standard_deviation { | ||||||
249 | return sqrt variance(@_); | ||||||
250 | } | ||||||
251 | *stdev = \&standard_deviation; | ||||||
252 | |||||||
253 | =head3 skewness | ||||||
254 | |||||||
255 | Returns run skewness as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence. | ||||||
256 | |||||||
257 | =cut | ||||||
258 | |||||||
259 | sub skewness { | ||||||
260 | my ( $self, @args ) = @_; | ||||||
261 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
262 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
263 | my $k3 = 0; | ||||||
264 | if ( $n1 != $n2 ) { | ||||||
265 | my $sum = $n1 + $n2; | ||||||
266 | $k3 = | ||||||
267 | ( ( 2 * $n1 * $n2 ) / $sum**3 ) * | ||||||
268 | ( ( ( 16 * $n1**2 * $n2**2 ) / $sum**2 ) - | ||||||
269 | ( ( 4 * $n1 * $n2 * ( $sum + 3 ) ) / $sum ) + | ||||||
270 | 3 * $sum ); | ||||||
271 | } | ||||||
272 | return $k3; | ||||||
273 | } | ||||||
274 | |||||||
275 | =head3 kurtosis | ||||||
276 | |||||||
277 | Returns run kurtosis as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence. | ||||||
278 | |||||||
279 | =cut | ||||||
280 | |||||||
281 | sub kurtosis { | ||||||
282 | my ( $self, @args ) = @_; | ||||||
283 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
284 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
285 | my $sum = $n1 + $n2; | ||||||
286 | my $k4 = ( ( 2 * $n1 * $n2 ) / $sum**4 ) * ( | ||||||
287 | ( ( 48 * ( 5 * $sum - 6 ) * $n1**3 * $n2**3 ) / ( $sum**2 * $sum**2 ) ) | ||||||
288 | - ( | ||||||
289 | ( 48 * ( 2 * $sum**2 + 3 * $sum - 6 ) * $n1**2 * $n2**2 ) / | ||||||
290 | ( $sum**2 * $sum ) | ||||||
291 | ) + ( | ||||||
292 | ( 2 * ( 4 * $sum**3 + 45 * $sum**2 - 37 * $sum - 18 ) * $n1 * $n2 ) | ||||||
293 | / $sum**2 | ||||||
294 | ) - ( 7 * $sum**2 + 13 * $sum - 6 ) | ||||||
295 | ); | ||||||
296 | return $k4; | ||||||
297 | } | ||||||
298 | |||||||
299 | =head2 Distribution and tests | ||||||
300 | |||||||
301 | =head3 pmf | ||||||
302 | |||||||
303 | $p = $runs->pmf(data => \@data); # or no args to use last pre-loaded data | ||||||
304 | $p = $runs->pmf(observed => 5, freqs => [5, 20]); | ||||||
305 | |||||||
306 | Implements the runs probability mass function, returning the probability for a particular number of runs given so many dichotomous events (e.g., as in Swed & Eisenhart, 1943, p. 66); i.e., for I' the observed number of runs, I {I = I'}. The required function parameters are the observed number of runs, and the frequencies (counts) of each state in the sequence, which can be given directly, as above, in the arguments B |
||||||
307 | |||||||
308 | =cut | ||||||
309 | |||||||
310 | sub pmf { | ||||||
311 | my ( $self, @args ) = @_; | ||||||
312 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
313 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
314 | my $u = $self->observed($args); | ||||||
315 | return _pmf_num( $u, $n1, $n2 ) / _pmf_denom( $n1, $n2 ); | ||||||
316 | } | ||||||
317 | |||||||
318 | =head3 cdf | ||||||
319 | |||||||
320 | $p = $runs->cdf(data => \@data); # or no args to use last pre-loaded data | ||||||
321 | $p = $runs->cdf(observed => 5, freqs => [5, 20]); | ||||||
322 | |||||||
323 | Implements the cumulative distribution function for runs, returning the probability of obtaining the observed number of runs or less down to the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; i.e., for I' the observed number of runs, I {I <= I'}. The summation is over the probability mass function L |
||||||
324 | |||||||
325 | =cut | ||||||
326 | |||||||
327 | sub cdf { | ||||||
328 | my ( $self, @args ) = @_; | ||||||
329 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
330 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
331 | my $u = $self->observed($args); | ||||||
332 | my $sum = 0; | ||||||
333 | for ( 2 .. $u ) { | ||||||
334 | $sum += _pmf_num( $_, $n1, $n2 ); | ||||||
335 | } | ||||||
336 | return $sum / _pmf_denom( $n1, $n2 ); | ||||||
337 | } | ||||||
338 | |||||||
339 | =head3 cdfi | ||||||
340 | |||||||
341 | $p = $runs->cdfi(data => \@data); # or no args for last pre-loaded data | ||||||
342 | $p = $runs->cdfi(observed => 11, freqs => [5, 11]); | ||||||
343 | |||||||
344 | Implements the (inverse) cumulative distribution function for runs, returning the probability of obtaining more than the observed number of runs up from the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; ; i.e., for I' the observed number of runs, I = 1 - I {I <= I' - 1}. The summation is over the probability mass function L |
||||||
345 | |||||||
346 | =cut | ||||||
347 | |||||||
348 | sub cdfi { | ||||||
349 | my ( $self, @args ) = @_; | ||||||
350 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
351 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
352 | my $u = $self->observed($args); | ||||||
353 | my $sum = 0; | ||||||
354 | for ( 2 .. $u - 1 ) { | ||||||
355 | $sum += _pmf_num( $_, $n1, $n2 ); | ||||||
356 | } | ||||||
357 | return 1 - $sum / _pmf_denom( $n1, $n2 ); | ||||||
358 | } | ||||||
359 | |||||||
360 | =head3 z_value | ||||||
361 | |||||||
362 | $v = $runs->z_value(ccorr => 1); # use data already loaded - anonymously; or specify its "label" or "index" - see observed() | ||||||
363 | $v = $runs->z_value(data => $aref, ccorr => 1); | ||||||
364 | ($zvalue, $pvalue) = $runs->z_value(data => $aref, ccorr => 1, tails => 2); # same but wanting an array, get the p-value too | ||||||
365 | |||||||
366 | Returns the zscore from a test of runcount deviation, taking the runcount expected away from that observed and dividing by the root expected runcount variance, by default with a continuity correction to expectation. Called wanting an array, returns the z-value with its I -value for the tails (1 or 2) given. |
||||||
367 | |||||||
368 | The data to test can already have been L |
||||||
369 | |||||||
370 | Other options are B |
||||||
371 | |||||||
372 | I |
||||||
373 | |||||||
374 | =cut | ||||||
375 | |||||||
376 | sub z_value { | ||||||
377 | my ( $self, @args ) = @_; | ||||||
378 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
379 | my $zed = Statistics::Zed->new(); | ||||||
380 | my ( $zval, $pval ) = $zed->zscore( | ||||||
381 | observed => $self->rco($args), | ||||||
382 | expected => $self->rce($args), | ||||||
383 | variance => $self->rcv($args), | ||||||
384 | ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ), | ||||||
385 | tails => ( $args->{'tails'} || 2 ), | ||||||
386 | precision_s => $args->{'precision_s'}, | ||||||
387 | precision_p => $args->{'precision_p'}, | ||||||
388 | ); | ||||||
389 | return wantarray ? ( $zval, $pval ) : $zval; | ||||||
390 | } | ||||||
391 | *rzs = \&z_value; | ||||||
392 | *runcount_zscore = \&z_value; | ||||||
393 | *zscore = \&z_value; | ||||||
394 | |||||||
395 | =head3 p_value | ||||||
396 | |||||||
397 | $p = $runs->p_value(); # using loaded data and default args | ||||||
398 | $p = $runs->p_value(ccorr => 0|1, tails => 1|2); # normal-approx. for last-loaded data | ||||||
399 | $p = $runs->p_value(exact => 1); # calc combinatorially for observed >= or < than expectation | ||||||
400 | $p = $runs->p_value(data => [1, 0, 1, 1, 0], exact => 1); # given data | ||||||
401 | $p = $runs->p_value(freqs => [12, 12], observed => 8); # no data sequence, specify known params | ||||||
402 | |||||||
403 | Returns the probability of getting the observed number of runs or a smaller number given the number of each of the two events. By default, a large sample is assumed, and the probability is obtained from the normalized deviation, as given by the L |
||||||
404 | |||||||
405 | If the option B -value from any of these tests. Output from these tests has been checked against the tables and examples in Swed & Eisenhart (given to 7 decimal places), and found to agree. |
||||||
406 | |||||||
407 | The option B -value to so many decimal places. |
||||||
408 | |||||||
409 | I |
||||||
410 | |||||||
411 | =cut | ||||||
412 | |||||||
413 | sub p_value { | ||||||
414 | my ( $self, @args ) = @_; | ||||||
415 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
416 | my $pval; | ||||||
417 | if ( $args->{'exact'} ) { | ||||||
418 | $pval = | ||||||
419 | ( $self->observed($args) - $self->rce($args) >= 0 ) | ||||||
420 | ? $self->cdfi($args) | ||||||
421 | : $self->cdf($args); | ||||||
422 | $pval *= 2 if $args->{'tails'} and $args->{'tails'} == 2; | ||||||
423 | $pval = sprintf( q{%.} . $args->{'precision_p'} . 'f', $pval ) | ||||||
424 | if $args->{'precision_p'}; | ||||||
425 | } | ||||||
426 | else { | ||||||
427 | $pval = ( $self->zscore($args) )[1]; | ||||||
428 | } | ||||||
429 | return $pval; | ||||||
430 | } | ||||||
431 | *test = \&p_value; | ||||||
432 | *runs_test = \*p_value; | ||||||
433 | *rct = \*p_value; | ||||||
434 | |||||||
435 | =head3 lrx2 | ||||||
436 | |||||||
437 | Likelihood ratio chi-square test for runs by length. | ||||||
438 | |||||||
439 | =cut | ||||||
440 | |||||||
441 | sub lrx2 { | ||||||
442 | my ( $self, @args ) = @_; | ||||||
443 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
444 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
445 | return; | ||||||
446 | } | ||||||
447 | |||||||
448 | =head3 ztest_ok | ||||||
449 | |||||||
450 | Returns true for the loaded sequence if its constituent sample numbers are sufficient for their expected runs to be normally approximated - using Siegal's (1956, p. 140) rule - ok if I |
||||||
451 | |||||||
452 | =cut | ||||||
453 | |||||||
454 | sub ztest_ok { | ||||||
455 | my ( $self, @args ) = @_; | ||||||
456 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
457 | my ( $n1, $n2 ) = $self->bi_frequency($args); | ||||||
458 | my $retval = | ||||||
459 | $n1 > 20 || $n2 > 20 | ||||||
460 | ? 1 | ||||||
461 | : 0 | ||||||
462 | ; # Siegal's rule (p. 140) - ok if either of the two Ns are greater than 20 | ||||||
463 | return $retval; | ||||||
464 | } | ||||||
465 | |||||||
466 | =head2 Utils | ||||||
467 | |||||||
468 | Methods used internally, or for returning/printing descriptives, etc., in a bunch. | ||||||
469 | |||||||
470 | =head3 bi_frequency | ||||||
471 | |||||||
472 | @freq = $runs->bi_frequency(data => \@data); # or no args if using last pre-loaded data | ||||||
473 | |||||||
474 | Returns frequency of the two elements - or croaks if there are more than 2, and gives zero for any absent. | ||||||
475 | |||||||
476 | =cut | ||||||
477 | |||||||
478 | sub bi_frequency { | ||||||
479 | my ( $self, @args ) = @_; | ||||||
480 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
481 | carp | ||||||
482 | 'Argument named \'trials\' is deprecated; use \'freqs\' to give aref of frequencies per state' | ||||||
483 | if $args->{'trials'}; | ||||||
484 | return @{ $args->{'freqs'} } if ref $args->{'freqs'}; | ||||||
485 | my $data = _get_data( $self, $args ); | ||||||
486 | my %states = (); | ||||||
487 | $states{$_}++ for @{$data}; # hash keying each element with its frequency | ||||||
488 | croak 'Cannot compute runs: More than two states were found in the data: ' | ||||||
489 | . join( q{, }, keys %states ) | ||||||
490 | if scalar keys %states > 2; | ||||||
491 | my @vals = values %states; | ||||||
492 | $vals[1] = 0 if scalar @vals < 2; | ||||||
493 | return @vals; | ||||||
494 | } | ||||||
495 | |||||||
496 | =head3 n_max_seq | ||||||
497 | |||||||
498 | $n = $runs->n_max_seq(); # loaded data | ||||||
499 | $n = $runs->n_max_seq(data => \@data); # this sequence | ||||||
500 | $n = $runs->n_max_seq(observed => int, freqs => [int, int]); # these specs | ||||||
501 | |||||||
502 | Returns the number of possible sequences for the two given state frequencies. So the proverbial urn contains I |
||||||
503 | |||||||
504 | =for html Nmax = ( N1 + N2 )! / N1!N2! |
||||||
505 | |||||||
506 | With the usual definition of a probability as M / N, this is the denominator term in the runs L |
||||||
507 | |||||||
508 | =cut | ||||||
509 | |||||||
510 | sub n_max_seq { | ||||||
511 | my ( $self, @args ) = @_; | ||||||
512 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
513 | my @freqs = $self->bi_frequency($args); | ||||||
514 | return _pmf_denom(@freqs); | ||||||
515 | } | ||||||
516 | |||||||
517 | =head3 m_seq_k | ||||||
518 | |||||||
519 | $n = $runs->m_seq_k(); # loaded data | ||||||
520 | $n = $runs->m_seq_k(data => \@data); # this sequence | ||||||
521 | $n = $runs->m_seq_k(observed => int, freqs => [int, int]); # these specs | ||||||
522 | |||||||
523 | Returns the number of sequences that can produce I |
||||||
524 | |||||||
525 | =cut | ||||||
526 | |||||||
527 | sub m_seq_k { | ||||||
528 | my ( $self, @args ) = @_; | ||||||
529 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
530 | my $u = $self->observed($args); | ||||||
531 | my @freqs = $self->bi_frequency($args); | ||||||
532 | return _pmf_num( $u, @freqs ); | ||||||
533 | } | ||||||
534 | |||||||
535 | =head3 stats_hash | ||||||
536 | |||||||
537 | $href = $runs->stats_hash(values => {observed => 1, expected => 1, variance => 1, z_value => 1, p_value => 1}, exact => 0, ccorr => 1); | ||||||
538 | |||||||
539 | Returns a hashref for the counts and stats as specified in its "values" argument, and with any options for calculating them (e.g., exact for p_value). See L |
||||||
540 | |||||||
541 | =head3 dump | ||||||
542 | |||||||
543 | $runs->dump(values => { observed => 1, variance => 1, p_value => 1}, exact => 1, flag => 1, precision_s => 3); # among other options | ||||||
544 | |||||||
545 | Print Runs-test results to STDOUT. See L |
||||||
546 | |||||||
547 | =cut | ||||||
548 | |||||||
549 | sub dump { | ||||||
550 | my ( $self, @args ) = @_; | ||||||
551 | my $args = ref $args[0] ? $args[0] : {@args}; | ||||||
552 | $args->{'stat'} = 'runs'; | ||||||
553 | $self->SUPER::dump($args); | ||||||
554 | return; | ||||||
555 | } | ||||||
556 | |||||||
557 | =head3 dump_data | ||||||
558 | |||||||
559 | $runs->dump_data(delim => "\n"); # print whatevers loaded (or specify by label, index, or as "data") | ||||||
560 | |||||||
561 | See L |
||||||
562 | |||||||
563 | =cut | ||||||
564 | |||||||
565 | # Private methods: | ||||||
566 | |||||||
567 | sub _pmf_num { | ||||||
568 | my ( $u, $m, $n ) = @_; | ||||||
569 | my $f; | ||||||
570 | if ( is_even($u) ) { | ||||||
571 | my $k = $u / 2 - 1; | ||||||
572 | $f = 2 * _choose( $m - 1, $k ) * _choose( $n - 1, $k ); | ||||||
573 | } | ||||||
574 | else { | ||||||
575 | my $k = ( $u + 1 ) / 2; | ||||||
576 | $f = | ||||||
577 | _choose( $m - 1, $k - 1 ) * _choose( $n - 1, $k - 2 ) + | ||||||
578 | _choose( $m - 1, $k - 2 ) * _choose( $n - 1, $k - 1 ); | ||||||
579 | } | ||||||
580 | return $f; | ||||||
581 | } | ||||||
582 | |||||||
583 | sub _pmf_denom { | ||||||
584 | return _choose( sum(@_), $_[0] ); | ||||||
585 | |||||||
586 | #return _factorial(sum(@_)) / ( _factorial($_[0]) * _factorial($_[1]) ); | ||||||
587 | } | ||||||
588 | |||||||
589 | sub _choose { # from Orwant et al., p. 573 | ||||||
590 | my ( $n, $k ) = @_; | ||||||
591 | my ( $res, $j ) = ( 1, 1 ); | ||||||
592 | return 0 if $k > $n || $k < 0; | ||||||
593 | $k = ( $n - $k ) if ( $n - $k ) < $k; | ||||||
594 | while ( $j <= $k ) { | ||||||
595 | $res *= $n--; | ||||||
596 | $res /= $j++; | ||||||
597 | } | ||||||
598 | return $res; | ||||||
599 | } | ||||||
600 | |||||||
601 | sub _factorial { | ||||||
602 | my ( $n, $res ) = ( shift, 1 ); | ||||||
603 | return undef unless $n >= 0 and $n == int($n); | ||||||
604 | $res *= $n-- while $n > 1; | ||||||
605 | return $res; | ||||||
606 | } | ||||||
607 | |||||||
608 | sub _get_data { | ||||||
609 | my ( $self, $args ) = @_; | ||||||
610 | return ref $args->{'data'} ? $args->{'data'} : $self->read($args); | ||||||
611 | } | ||||||
612 | |||||||
613 | 1; | ||||||
614 | |||||||
615 | __END__ |