File Coverage

blib/lib/Statistics/Sequences/Runs.pm

Criterion	Covered	Total	%
statement	171	189	90.4
branch	66	78	84.6
condition	9	16	56.2
subroutine	33	36	91.6
pod	18	18	100.0
total	297	337	88.1

line	stmt	bran	cond	sub	pod	time	code
1							package Statistics::Sequences::Runs;
2	6			6		312006	use 5.008008;
	6					15
3	6			6		19	use strict;
	6					6
	6					115
4	6			6		20	use warnings FATAL => 'all';
	6					16
	6					214
5	6			6		19	use Carp qw(carp croak);
	6					7
	6					307
6	6			6		20	use base qw(Statistics::Sequences);
	6					5
	6					2612
7	6			6		152904	use List::AllUtils qw(all mesh sum uniq);
	6					9
	6					281
8	6			6		21	use Number::Misc qw(is_even is_numeric);
	6					7
	6					214
9	6			6		2505	use Statistics::Zed 0.10;
	6					37208
	6					158
10	6			6		27	use String::Numeric qw(is_int);
	6					8
	6					10166
11							$Statistics::Sequences::Runs::VERSION = '0.22';
12
13							=pod
14
15							=head1 NAME
16
17							Statistics::Sequences::Runs - The Runs Test: Wald-Wolfowitz runs test descriptives, deviation and combinatorics
18
19							=head1 VERSION
20
21							This is documentation for B of Statistics::Sequences::Runs.
22
23							=head1 SYNOPSIS
24
25							use strict;
26							use Statistics::Sequences::Runs 0.22;
27							my $runs = Statistics::Sequences::Runs->new();
28
29							# Data are a sequence of dichotomous strings:
30							my @data = (qw/1 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1/);
31							my $val;
32
33							# - Pre-load data to use for all methods:
34							$runs->load(\@data);
35							$val = $runs->observed();
36							$val = $runs->expected();
37
38							# - or give data as "data => $aref" to each method:
39							$val = $runs->observed(data => AREF);
40
41							# - or give frequencies of the 2 "states" in a sequence:
42							$val = $runs->expected(freqs => [11, 9]); # works with other methods except observed()
43
44							# Deviation ratio:
45							$val = $runs->z_value(ccorr => 1);
46
47							# Probability of deviation from expectation:
48							my ($z, $p) = $runs->z_value(ccorr => 1, tails => 1); # dev. ratio with p-value
49							$val = $runs->p_value(tails => 1); # normal dist. p-value itself
50							$val = $runs->p_value(exact => 1, tails => 1); # p-value by combinatorics
51
52							# Keyed list of descriptives etc.:
53							my $href = $runs->stats_hash(values => [qw/observed p_value/], exact => 1);
54
55							# Print descriptives etc. in the same way:
56							$runs->dump(
57							values => [qw/observed expected p_value/],
58							exact => 1,
59							flag => 1,
60							precision_s => 3,
61							precision_p => 7
62							);
63							# prints: observed = 11.000, expected = 10.900, p_value = 0.5700167
64
65							=head1 DESCRIPTION
66
67							The module returns statistical information re Wald-type runs across a sequence of dichotmous events on one or more consecutive trials. For example, given an accuracy-based sequence composed of matches (H) and misses (M) like (H, H, M, H, M, M, M, M, H), there are 5 runs: 3 for Hs, 2 for Ms. This observed number of runs can be compared with the number expected to occur by chance over the number of trials, relative to the expected variance. More runs than expected ("negative serial dependence") can denote irregularity, instability, mixing up of alternatives. Fewer runs than expected ("positive serial dependence") can denote cohesion, insulation, isolation of alternatives. Both can indicate sequential dependency: either negative (an alternation bias), or positive (a repetition bias).
68
69							The distribution of runs is asymptotically normal, and a deviation-based test of extra-chance occurrence when at least one alternative has more than 20 occurrences (Siegal rule), or both event occurrences exceed 10 (Kelly, 1982), is conventionally considered reliable; otherwise, the module provides an "exact test" based on combinatorics.
70
71							For non-dichotomous, continuous or multinomial data, see L for potentially transforming them for runs descriptives/tests.
72
73							=head1 SUBROUTINES/METHODS
74
75							=head2 Data-handling
76
77							=head3 new
78
79							$runs = Statistics::Sequences::Runs->new();
80
81							Returns a new Runs object. Expects/accepts no arguments but the classname.
82
83							=head3 load
84
85							$runs->load(ARRAY);
86							$runs->load(AREF);
87							$runs->load(foodat => AREF); # named whatever
88
89							Loads a sequence anonymously or by name - see L in the Statistics::Data manpage for details on the various ways data can be loaded, updated and then retrieved. Every load unloads all previous loads and any updates to them.
90
91							Alternatively, skip this action; data don't always have to be loaded to use the stats methods here. The sequence can be provided with each method call, as shown below, or by simply giving the observed counts of runs (apart, of course, for calculating these counts, when a specific sequence is needed).
92
93							=head3 add, access, unload
94
95							See L for these additional operations on data that have been loaded.
96
97							=head2 Descriptives
98
99							=head3 observed
100
101							$v = $runs->observed(); # use the data loaded anonymously
102							$v = $runs->observed(name => 'foodat'); # ... or the name given on loading
103							$v = $runs->observed(data => AREF); # ... or just give the data now
104
105							Returns the total observed number of runs in the loaded or given data. For example,
106
107							$v = $runs->observed(data => [qw/H H H T T H H/]);
108
109							returns 3 (for the runs 'HHH', 'TT' and 'HH').
110
111							=cut
112
113							sub observed {
114	58			58	1	5416	my ( $self, @args ) = @_;
115	58	100				108	my $args = ref $args[0] ? $args[0] : {@args};
116	58	100				111	return $args->{'observed'} if defined $args->{'observed'};
117	49					95	my $data = _get_data( $self, $args );
118	49					1074	my $observed = 0;
119	49					39	for ( 0 .. scalar @{$data} - 1 ) {
	49					89
120	1250	100	100			2974	if ( $_ == 0 or $data->[$_] ne $data->[ $_ - 1 ] ) {
121	550					400	$observed++;
122							}
123							}
124	49					96	return $observed;
125							}
126
127							=head3 observed_per_state
128
129							@freq = $runs->observed_per_state(data => AREF);
130							$href = $runs->observed_per_state(data => AREF);
131
132							Returns the number of runs per state - as a two-dimensional array where the first element gives the count for the first state in the data, and so for the second. A hashref is returned if not called in list context, the frequencies keyed by state. For example:
133
134							@ari = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns (2, 1)
135							$ref = $runs->observed_per_state(data => [qw/H H H T T H H/]); # returns { H => 2, T => 1}
136
137							Exceptions: If there was only one state in the loaded/given sequence (e.g., data => [qw/H H H/]), there is only one run and so the returned array will be one-dimensional, i.e., (1), and the returned hashref has only a single key (for this example: { H => 1 }). If there are no states, with an empty array loaded/given for the sequence, then the same applies, except the returned array is (0) and the returned hashref has the empty string as its single key ( q{} => 0 ).
138
139							=cut
140
141							sub observed_per_state {
142	6			6	1	2123	my ( $self, @args ) = @_;
143	6	50				17	my $args = ref $args[0] ? $args[0] : {@args};
144	6					10	my $data = _get_data( $self, $args );
145	6					143	my @states = uniq @{$data};
	6					31
146	6					9	my @freqs = ();
147	6	100				5	if ( scalar @{$data} ) {
	6					37
148	5	100				10	if ( scalar @states > 1 ) {
149	3	50				8	@freqs = $data->[0] eq $states[0] ? ( 1, 0 ) : ( 0, 1 );
150							}
151							else {
152	2					3	@freqs = (1);
153							}
154	5					4	for ( 1 .. scalar @{$data} - 1 ) {
	5					12
155	64	100				84	if ( $data->[$_] ne $data->[ $_ - 1 ] ) {
156	11	100				13	if ( $data->[$_] eq $states[0] ) {
157	5					5	$freqs[0]++;
158							}
159							else {
160	6					6	$freqs[1]++;
161							}
162							}
163							}
164							}
165							else {
166	1					2	@states = (q{});
167	1					2	@freqs = (0);
168							}
169	6	100				32	return wantarray ? @freqs : { mesh @states, @freqs };
170							}
171
172							=head3 expected
173
174							$v = $runs->expected(); # or specify loaded data by "name", or give as "data"
175							$v = $runs->expected(data => AREF); # use these data
176							$v = $runs->expected(freqs => [POS_INT, POS_INT]); # no actual data; calculate from these two Ns
177
178							Returns the expected number of runs across the loaded data. Expectation is given as follows:
179
180							=for html E[R] = ( (2n₁n₂) / (n₁ + n₂) ) + 1
181
182							where I(I is the number of observations of each element in the data.
183
184							=cut
185
186							sub expected {
187	36			36	1	1950	my ( $self, @args ) = @_;
188	36					74	my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
189	36					28	my $val;
190	36	100				53	if ($sum) {
191	33					55	$val = ( ( 2 * $n1 * $n2 ) / $sum ) + 1;
192							}
193	36					68	return $val;
194							}
195
196							=head3 variance
197
198							$v = $runs->variance(); # use data already loaded - anonymously; or specify its "name"
199							$v = $runs->variance(data => AREF); # use these data
200							$v = $runs->variance(freqs => [POS_INT, POS_INT]); # use these counts - not any particular sequence of data
201
202							Returns the variance in the number of runs for the given data.
203
204							=for html V[R] = ( (2n₁n₂)([2n₁n₂] – [n₁ + n₂]) ) / ( ((n₁ + n₂)²)((n₁ + n₂) – 1) )
205
206							defined as above for L.
207
208							The data to test can already have been Led, or you send it directly as a flat referenced array keyed as B.
209
210							=cut
211
212							sub variance {
213	35			35	1	2316	my ( $self, @args ) = @_;
214	35					50	my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
215	35					30	my $val;
216	35	100				45	if ($sum) {
217	32	100				48	if ( $sum < 2 ) {
218	4					4	$val = 0;
219							}
220							else {
221	28					78	$val =
222							( ( 2 * $n1 * $n2 * ( ( 2 * $n1 * $n2 ) - $sum ) ) /
223							( ( $sum*2 ) ( $sum - 1 ) ) );
224							}
225							}
226	35					127	return $val;
227							}
228
229							=head3 observed_deviation
230
231							$v = $runs->obsdev(); # use data already loaded - anonymously; or specify its "name"
232							$v = $runs->obsdev(data => AREF); # use these data
233
234							Returns the deviation of (difference between) observed and expected runs for the loaded/given sequence (I - I).
235
236							I: obsdev
237
238							=cut
239
240							sub observed_deviation {
241	2			2	1	545	my ( $self, @args ) = @_;
242	2					10	return $self->observed(@args) - $self->expected(@args);
243							}
244							*obsdev = \&observed_deviation;
245
246							=head3 standard_deviation
247
248							$v = $runs->stdev(); # use data already loaded - anonymously; or specify its "name"
249							$v = $runs->stdev(data => AREF);
250							$v = $runs->stdev(freqs => [POS_INT, POS_INT]); # don't use actual data; calculate from these two Ns
251
252							Returns square-root of the variance.
253
254							I: stdev, stddev
255
256							=cut
257
258							sub standard_deviation {
259	2			2	1	547	my ( $self, @args ) = @_;
260	2					5	return sqrt $self->variance(@args);
261							}
262							*stdev = \&standard_deviation;
263							*stddev = \&standard_deviation;
264
265							=head3 skewness
266
267							$v = $runs->skewness(); # use data already loaded - anonymously; or specify its "name"
268							$v = $runs->skewness(data => AREF); # use these data
269
270							Returns run skewness as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence.
271
272							=cut
273
274							sub skewness {
275	0			0	1	0	my ( $self, @args ) = @_;
276	0					0	my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
277	0					0	my $k3 = 0;
278	0	0	0			0	if ( $sum && $n1 != $n2 ) {
279	0					0	$k3 =
280							( ( 2 * $n1 * $n2 ) / $sum*3 )
281							( ( ( 16 * $n1*2 $n22 ) / $sum2 ) -
282							( ( 4 * $n1 * $n2 * ( $sum + 3 ) ) / $sum ) +
283							3 * $sum );
284							}
285	0					0	return $k3;
286							}
287
288							=head3 kurtosis
289
290							$v = $runs->kurtosis(); # use data already loaded - anonymously; or specify its "name"
291							$v = $runs->kurtosis(data => AREF); # use these data
292
293							Returns run kurtosis as given by Barton & David (1958) based on the frequencies of the two different elements in the sequence.
294
295							=cut
296
297							sub kurtosis {
298	0			0	1	0	my ( $self, @args ) = @_;
299	0					0	my ( $sum, $n1, $n2 ) = _sum_bi_frequency( $self, @args );
300	0					0	my $k4;
301	0	0				0	if ( defined $sum ) {
302	0					0	$k4 = ( ( 2 * $n1 * $n2 ) / $sum*4 ) (
303							(
304							( 48 * ( 5 * $sum - 6 ) * $n1*3 $n2**3 ) /
305							( $sum*2 $sum**2 )
306							) - (
307							( 48 * ( 2 * $sum*2 + 3 $sum - 6 ) * $n1*2 $n2**2 ) /
308							( $sum*2 $sum )
309							) + (
310							(
311							2 * ( 4 * $sum*3 + 45 $sum*2 - 37 $sum - 18 ) *
312							$n1 * $n2
313							) / $sum**2
314							) - ( 7 * $sum*2 + 13 $sum - 6 )
315							);
316							}
317	0					0	return $k4;
318							}
319
320							=head2 Distribution and tests
321
322							=head3 pmf
323
324							$p = $runs->pmf(data => AREF); # or no args to use last pre-loaded data
325							$p = $runs->pmf(observed => POS_INT, freqs => [POS_INT, POS_INT]);
326
327							Implements the runs probability mass function, returning the probability for a particular number of runs given so many dichotomous events (e.g., as in Swed & Eisenhart, 1943, p. 66); i.e., for I' the observed number of runs, I {I = I'}. The required function parameters are the observed number of runs, and the frequencies (counts) of each state in the sequence, which can be given directly, as above, in the arguments B and B, respectively, or these will be worked out from a given data sequence itself (given here or as pre-loaded). For derivation, see its public internal methods L and L, which make use of the choose() method from Orwant et al. (1999).
328
329							=cut
330
331							sub pmf {
332	2			2	1	360	my ( $self, @args ) = @_;
333	2					6	my ( $n1, $n2 ) = $self->bi_frequency(@args);
334	2					4	return _pmf_num( $self->observed(@args), $n1, $n2 ) /
335							_pmf_denom( $n1, $n2 );
336							}
337
338							=head3 cdf
339
340							$p = $runs->cdf(data => AREF); # or no args to use last pre-loaded data
341							$p = $runs->cdf(observed => POS_INT, freqs => [POS_INT, POS_INT]);
342
343							Implements the cumulative distribution function for runs, returning the probability of obtaining the observed number of runs or less down to the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; i.e., for I' the observed number of runs, I {I <= I'}. The summation is over the probability mass function L. The function parameters are the observed number of runs, and the frequencies (counts) of the two events, which can be given directly, as above, in the arguments B and B, respectively, or these will be worked out from a given data sequence itself (given here or as pre-loaded).
344
345							=cut
346
347							sub cdf {
348	5			5	1	545	my ( $self, @args ) = @_;
349	5					9	my ( $n1, $n2 ) = $self->bi_frequency(@args);
350	5					8	my $u = $self->observed(@args);
351	5					6	my $sum = 0;
352	5					7	for ( 2 .. $u ) {
353	39					41	$sum += _pmf_num( $_, $n1, $n2 );
354							}
355	5					7	return $sum / _pmf_denom( $n1, $n2 );
356							}
357
358							=head3 cdfi
359
360							$p = $runs->cdfi(data => AREF); # or no args for last pre-loaded data
361							$p = $runs->cdfi(observed => POS_INT, freqs => [POS_INT, POS_INT]);
362
363							Implements the (inverse) cumulative distribution function for runs, returning the probability of obtaining more than the observed number of runs up from the expected number of 2 (assuming that the two possible events are actually represented in the data), as per Swed & Eisenhart (1943), p. 66; ; i.e., for I' the observed number of runs, I = 1 - I {I <= I' - 1}. The summation is over the probability mass function L. The function parameters are the observed number of runs, and the frequencies (counts) of the two events, which can be given directly, as above, in the arguments B and B, respectively, or these will be worked out from a given data sequence itself (given here as B or as pre-loaded).
364
365							=cut
366
367							sub cdfi {
368	8			8	1	383	my ( $self, @args ) = @_;
369	8					13	my ( $n1, $n2 ) = $self->bi_frequency(@args);
370	8					15	my $u = $self->observed(@args);
371	8					7	my $sum = 0;
372	8					14	for ( 2 .. $u - 1 ) {
373	147					138	$sum += _pmf_num( $_, $n1, $n2 );
374							}
375	8					27	return 1 - $sum / _pmf_denom( $n1, $n2 );
376							}
377
378							=head3 z_value
379
380							$v = $runs->z_value(ccorr => BOOL); # use data already loaded - anonymously; or specify its "name"
381							$v = $runs->z_value(data => AREF, ccorr => BOOL);
382							($zvalue, $pvalue) = $runs->z_value(data => AREF, ccorr => BOOL, tails => 1\|2); # wanting an array, get p-value too
383
384							Returns the normal deviate from a test of runcount deviation, taking the runcount expected from that observed and dividing by the root variance, by default with a continuity correction to expectation. Called wanting an array, returns the I-value with its I -value for the B (1 or 2) given. The returned value is an empty string if the variance is undefined, empty or equals 0 (as when there is only one state in the sequence).
385
386							The data to test can already have been Led, or sent directly as an aref keyed as B.
387
388							Other options are B (for the z_value) and B (for the p_value).
389
390							I: zscore, zvalue
391
392							=cut
393
394							sub z_value {
395	8			8	1	1895	my ( $self, @args ) = @_;
396	8	100				22	my $args = ref $args[0] ? $args[0] : {@args};
397	8					17	my $observed = $self->observed($args);
398	8	100				17	return q{} if $observed == 0;
399	7					52	my $zed = Statistics::Zed->new();
400							return $zed->z_value(
401							observed => $observed,
402							expected => $self->expected($args),
403							variance => $self->variance($args),
404							ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ),
405							tails => ( $args->{'tails'} \|\| 2 ),
406							precision_s => $args->{'precision_s'},
407	7	100	50			128	precision_p => $args->{'precision_p'},
408							);
409							}
410							*zvalue = \&z_value;
411							*zscore = \&z_value;
412
413							=head3 p_value
414
415							$p = $runs->p_value(); # using loaded data and default args
416							$p = $runs->p_value(ccorr => BOOL, tails => 1\|2); # normal-approx. for last-loaded data
417							$p = $runs->p_value(exact => BOOL); # calc combinatorially for observed >= or < than expectation
418							$p = $runs->p_value(data => AREF, exact => BOOL); # given data
419							$p = $runs->p_value(observed => POS_INT, freqs => [POS_INT, POS_INT]); # no data sequence, specify known params
420
421							Returns the probability of getting the observed number of runs or a smaller number given the number of each of the two events. By default, a large sample is assumed, and the probability is obtained from the normalized deviation, as given by the L method.
422
423							If the option B is defined and not zero, then the probability is worked out combinatorially, as per Swed & Eisenhart (1943), Eq. 1, p. 66 (and also Siegal, 1956, Eqs. 6.12a and 6.12b, p. 138). This is only implemented as a one-tailed test; the B option has no effect. This tests the hypotheses that there are either too many or too few runs relative to chance expectation; which of these hypotheses is tested is based on the expected value returned by the L method, using L if there are more runs than expected, or L if there are fewer runs than expected; use these functions themselves to specify the hypothesis to be tested.
424
425							If there is only one state/event in the sequence, then the variance from the expected value of 1 is 0, and this method returns 1 (however long this single event sequence is, the observed number of runs cannot differ from the expected number of runs). If the sequence is empty, an empty string is returned.
426
427							Output from these tests has been checked against the tables and examples in Swed & Eisenhart (given to 7 decimal places), and found to agree.
428
429							The option B gives the returned I -value to so many decimal places.
430
431							I: pvalue
432
433							=cut
434
435							sub p_value {
436	19			19	1	4859	my ( $self, @args ) = @_;
437	19	100				59	my $args = ref $args[0] ? $args[0] : {@args};
438	19					34	my $vals = {
439							observed => $self->observed($args),
440							expected => $self->expected($args),
441							variance => $self->variance($args),
442							};
443	19	100				48	return $args->{'exact'}
444							? _p_exact( $self, $args, $vals )
445							: _p_norm( $self, $args, $vals );
446							}
447							*pvalue = \&p_value;
448
449							=head3 ztest_ok
450
451							$bool = $runs->ztest_ok(); # use data already loaded - anonymously; or specify its "name"
452							$bool = $runs->ztest_ok(data => AREF);
453
454							Returns true for the loaded sequence if its constituent sample numbers are sufficient for their expected runs to be normally approximated - using Siegal's (1956, p. 140) rule - ok if I of the two Is are greater than 20.
455
456							=cut
457
458							sub ztest_ok {
459	5			5	1	1658	my ( $self, @args ) = @_;
460	5					10	my ( $n1, $n2 ) = $self->bi_frequency(@args);
461	5	100	100			26	my $retval =
462							$n1 > 20 \|\| $n2 > 20
463							? 1
464							: 0;
465	5					9	return $retval;
466							}
467
468							=head2 Utils
469
470							Methods used internally, or for returning/printing descriptives, etc., in a bunch.
471
472							=head3 bi_frequency
473
474							@freq = $runs->bi_frequency(data => AREF); # or no args if using last pre-loaded data
475
476							Returns frequency of the two elements - or croaks if there are more than 2, and gives zero for any absent.
477
478							=cut
479
480							sub bi_frequency {
481	99			99	1	961	my ( $self, @args ) = @_;
482	99	100				172	my $args = ref $args[0] ? $args[0] : {@args};
483
484							# might be called internally from a method where an array of frequencies per state is optionally already given:
485							carp
486							'Argument named \'trials\' is deprecated; use \'freqs\' to give aref of frequencies per state'
487	99	50				153	if $args->{'trials'};
488	99	100				150	return @{ $args->{'freqs'} } if ref $args->{'freqs'};
	17					34
489	82					94	my $data = _get_data( $self, $args );
490
491							# build hash keying each element with its frequency:
492	82					1508	my %states = ();
493	82					65	for ( @{$data} ) {
	82					92
494	1983					1395	$states{$_}++;
495							}
496
497							# Check that number of states in the sequences are computable for Runs,
498							# and ensure that there is at least zero frequency for the 2 states:
499	82					69	my $nstates = scalar keys %states;
500	82					116	my @vals = values %states;
501
502	82	100				182	if ( !$nstates ) {
		100
		50
503	7					12	@vals = ( 0, 0 );
504							}
505							elsif ( $nstates == 1 ) {
506	27					21	push @vals, 0;
507							}
508							elsif ( $nstates > 2 ) {
509	0					0	croak
510							'Cannot compute runs: More than two states were found in the data';
511							}
512	82					155	return @vals;
513							}
514
515							=head3 n_max_seq
516
517							$n = $runs->n_max_seq(); # loaded data
518							$n = $runs->n_max_seq(data => AREF); # this sequence
519							$n = $runs->n_max_seq(observed => POS_INT, freqs => [POS_INT, POS_INT]); # these counts
520
521							Returns the number of possible sequences for the two given state frequencies. So the urn contains I1 black balls and I2 white balls, well mixed; taking I1 + I2 drawings from it without replacement, any sequence has the same probability of occurring; how many different sequences of black and white balls are possible? For the two counts, this is "sum of I1 + I2 I I1", or:
522
523							=for html N_max = ( N₁ + N₂ )! / N₁!N₂!
524
525							This is the denominator term in the runs L; not taking into account probability of obtaining so many of each event, of the proportion of black and white balls in the urn.
526
527							=cut
528
529							sub n_max_seq {
530	1			1	1	172	my ( $self, @args ) = @_;
531	1					3	return _pmf_denom( $self->bi_frequency(@args) );
532							}
533
534							=head3 m_seq_k
535
536							$n = $runs->m_seq_k(); # loaded data
537							$n = $runs->m_seq_k(data => AREF); # this sequence
538							$n = $runs->m_seq_k(observed => POS_INT, freqs => [POS_INT, POS_INT]); # these counts
539
540							Returns the number of sequences that can produce I runs from I elements of a single kind, with all other kinds of elements in the sequence assumed to be of a single kind, under the conditions of L. See Swed and Eisenhart (1943), or Barton and David (1958, p. 253). With the frequentist probability M / N, this is the numerator term in the runs L.
541
542							=cut
543
544							sub m_seq_k {
545	1			1	1	338	my ( $self, @args ) = @_;
546	1					3	return _pmf_num( $self->observed(@args), $self->bi_frequency(@args) );
547							}
548
549							=head3 stats_hash
550
551							$href = $runs->stats_hash(values => [qw/observed expected z_value/], precision_s => POS_INT, ccorr => BOOL); # among other values/options
552							$href = $runs->stats_hash(values =>
553							{
554							observed => BOOL,
555							expected => BOOL,
556							variance => BOOL,
557							z_value => BOOL,
558							p_value => BOOL,
559							},
560							exact => BOOL, # for p_value
561							ccorr => BOOL # for z_value
562							);
563
564							Returns a hashref for the counts and stats as specified in its "values" argument, and with any options for calculating them (e.g., exact for p_value). See L for details. If calling via a "runs" object, the option "stat => 'runs'" is not needed (unlike when using the parent "sequences" object).
565
566							=head3 dump
567
568							$runs->dump(values => [qw/observed expected z_value/], precision_s => POS_INT, ccorr => BOOL); # among other values/options
569							$runs->dump(values =>
570							{
571							observed => BOOL,
572							expected => BOOL,
573							variance => BOOL,
574							z_value => BOOL,
575							p_value => BOOL,
576							},
577							precision_s => POS_INT,
578							precision_p => POS_INT, # for p_value
579							flag => BOOL, # for p_value
580							exact => BOOL, # for p_value
581							ccorr => BOOL # for z_value
582							);
583
584							Print Runs-test results to STDOUT, including the stats as given a true value by their method names in a referenced hash of B, and with options relevant to thesemethods (see the template above). Default values to dump are observed() and p_value()). Optionally also give the data directly.
585
586							=cut
587
588							sub dump {
589	0			0	1	0	my ( $self, @args ) = @_;
590	0	0				0	my $args = ref $args[0] ? $args[0] : {@args};
591	0					0	$args->{'stat'} = 'runs';
592	0					0	$self->SUPER::dump($args);
593	0					0	return;
594							}
595
596							=head3 dump_data
597
598							$runs->dump_data(delim => "\n"); # print whatevers loaded (or specify by name, or as "data")
599
600							See L for details.
601
602							=cut
603
604							# Private methods:
605
606							sub _p_exact {
607	9			9		9	my ( $self, $args, $vals ) = @_;
608	9					7	my $pval;
609	9	100		18		26	if ( all { is_numeric($_) } ( $vals->{'observed'}, $vals->{'expected'} ) ) {
	18					106
610							$pval =
611	8	100				111	( $vals->{'observed'} - $vals->{'expected'} >= 0 )
612							? $self->cdfi($args)
613							: $self->cdf($args);
614	8	100				20	if ( $args->{'precision_p'} ) {
615	2					27	$pval = sprintf q{%.} . $args->{'precision_p'} . qw{f}, $pval;
616							}
617							}
618							else {
619	1					5	$pval = q{};
620							}
621	9					36	return $pval;
622							}
623
624							sub _p_norm {
625	10			10		10	my ( $self, $args, $vals ) = @_;
626	10					10	my $pval;
627	10	100				43	if ( !$vals->{'expected'} ) {
		100
628	1					2	$pval = q{};
629							}
630							elsif ( !$vals->{'variance'} ) {
631	2					3	$pval = 1;
632							}
633							else {
634	7					25	my $zed = Statistics::Zed->new();
635							$pval = $zed->p_value(
636	7					41	%{$vals},
637							ccorr => ( defined $args->{'ccorr'} ? $args->{'ccorr'} : 1 ),
638							tails => ( $args->{'tails'} \|\| 2 ),
639	7	100	50			100	precision_p => $args->{'precision_p'},
640							);
641							}
642	10					744	return $pval;
643							}
644
645							sub _pmf_num {
646	191			191		141	my ( $u, $m, $n ) = @_;
647	191					115	my $f;
648	191	100				220	if ( is_even($u) ) {
649	99					1255	my $k = $u / 2 - 1;
650	99					109	$f = 2 * _choose( $m - 1, $k ) * _choose( $n - 1, $k );
651							}
652							else {
653	92					1111	my $k = ( $u + 1 ) / 2;
654	92					103	$f =
655							_choose( $m - 1, $k - 1 ) * _choose( $n - 1, $k - 2 ) +
656							_choose( $m - 1, $k - 2 ) * _choose( $n - 1, $k - 1 );
657							}
658	191					230	return $f;
659							}
660
661							sub _pmf_denom {
662	17			17		199	my @args = @_;
663	17					49	return _choose( sum(@args), $args[0] );
664							}
665
666							sub _choose { # from Orwant et al., p. 573
667	583			583		378	my ( $n, $k ) = @_;
668	583					334	my ( $res, $j ) = ( 1, 1 );
669	583	50	33			1506	return 0 if $k > $n \|\| $k < 0;
670	583	100				637	$k = ( $n - $k ) if ( $n - $k ) < $k;
671	583					682	while ( $j <= $k ) {
672	4309					2534	$res *= $n--;
673	4309					4752	$res /= $j++;
674							}
675	583					701	return $res;
676							}
677
678							sub _sum_bi_frequency {
679	71			71		57	my ( $self, @args ) = @_;
680	71					136	my ( $n1, $n2 ) = $self->bi_frequency(@args);
681	71					51	my $sum;
682	71	50		142		196	if ( all { is_int($_) } ( $n1, $n2 ) ) {
	142					597
683	71					344	$sum = $n1 + $n2;
684							}
685	71					157	return ( $sum, $n1, $n2 );
686							}
687
688							sub _get_data {
689	137			137		113	my ( $self, $args ) = @_;
690	137	100				181	return ref $args->{'data'} ? $args->{'data'} : $self->get_aref( %{$args} );
	119					329
691							}
692
693							1;
694
695							__END__