| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Statistics::Test::Sequence; |
|
2
|
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
72710
|
use 5.006; |
|
|
3
|
|
|
|
|
12
|
|
|
|
3
|
|
|
|
|
111
|
|
|
4
|
3
|
|
|
3
|
|
17
|
use strict; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
126
|
|
|
5
|
3
|
|
|
3
|
|
16
|
use warnings; |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
143
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
|
8
|
|
|
|
|
|
|
|
|
9
|
3
|
|
|
3
|
|
16
|
use Carp qw/croak/; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
278
|
|
|
10
|
3
|
|
|
3
|
|
2685
|
use Params::Util qw/_POSINT _ARRAY _CODE/; |
|
|
3
|
|
|
|
|
17814
|
|
|
|
3
|
|
|
|
|
294
|
|
|
11
|
3
|
|
|
3
|
|
4639
|
use Math::BigFloat; |
|
|
3
|
|
|
|
|
80801
|
|
|
|
3
|
|
|
|
|
20
|
|
|
12
|
3
|
|
|
3
|
|
163968
|
use Memoize; |
|
|
3
|
|
|
|
|
7273
|
|
|
|
3
|
|
|
|
|
2781
|
|
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Statistics::Test::Sequence - Sequence correlation test for random numbers |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Statistics::Test::Sequence; |
|
21
|
|
|
|
|
|
|
my $tester = Statistics::Test::Sequence->new(); |
|
22
|
|
|
|
|
|
|
$tester->set_data( [map {rand()} 1..1000000] ); |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my ($metric, $actual_freq, $expected_freq) = $tester->test(); |
|
25
|
|
|
|
|
|
|
use Data::Dumper; |
|
26
|
|
|
|
|
|
|
print "$metric\n"; |
|
27
|
|
|
|
|
|
|
print "Frequencies:\n"; |
|
28
|
|
|
|
|
|
|
print Dumper $actual_freq; |
|
29
|
|
|
|
|
|
|
print "Expected frequencies:\n"; |
|
30
|
|
|
|
|
|
|
print Dumper $expected_freq; |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
This module implements a sequence correlation test for random number |
|
35
|
|
|
|
|
|
|
generators. It shows pairwise correlation between subsequent |
|
36
|
|
|
|
|
|
|
random numbers. |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
The algorithm is as follows: (Following Blobel. Citation in SEE ALSO section.) |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=over 2 |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item * |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Given C random numbers C. |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=item * |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
For all C, compare C with C. If C is greater |
|
49
|
|
|
|
|
|
|
then C, assign a 0-Bit to the number. Otherwise, assign a |
|
50
|
|
|
|
|
|
|
1-Bit. |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=item * |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Find all sequences of equal Bits. For every sequence, increment |
|
55
|
|
|
|
|
|
|
a counter for the length C of that sequence. (Regardless of whether it's |
|
56
|
|
|
|
|
|
|
a sequence of 1's or 0's.) |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=item * |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
For uncorrelated random numbers, the number of sequences C |
|
61
|
|
|
|
|
|
|
of length C in the set of C random numbers is expected to be: |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
N(k) = 2*((k^2+3*k+1)*N - (k^3+3*k^2-k-4)) / (k+3)! |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=back |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 METHODS |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=cut |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 new |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Creates a new random number tester. |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=cut |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
sub new { |
|
78
|
1
|
|
|
1
|
1
|
16
|
my $proto = shift; |
|
79
|
1
|
|
33
|
|
|
8
|
my $class = ref($proto)||$proto; |
|
80
|
|
|
|
|
|
|
|
|
81
|
1
|
|
|
|
|
3
|
my $self = { |
|
82
|
|
|
|
|
|
|
data => undef, |
|
83
|
|
|
|
|
|
|
}; |
|
84
|
|
|
|
|
|
|
|
|
85
|
1
|
|
|
|
|
4
|
bless $self => $class; |
|
86
|
|
|
|
|
|
|
|
|
87
|
1
|
|
|
|
|
7
|
return $self; |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=head2 set_data |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Sets the random numbers to operate on. First argument |
|
93
|
|
|
|
|
|
|
must be either an array reference to an array of random |
|
94
|
|
|
|
|
|
|
numbers or a code reference. |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
If the first argument is a code reference, the second |
|
97
|
|
|
|
|
|
|
argument must be an integer C. The code reference is |
|
98
|
|
|
|
|
|
|
called C-times and its return values are used as |
|
99
|
|
|
|
|
|
|
random numbers. |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
The code reference semantics are particularily useful if |
|
102
|
|
|
|
|
|
|
you do not want to store all random numbers in memory at |
|
103
|
|
|
|
|
|
|
the same time. You can write a subroutine that, for example, |
|
104
|
|
|
|
|
|
|
generates and returns batches of 100 random numbers so no |
|
105
|
|
|
|
|
|
|
more than 101 of these numbers will be in memory at the |
|
106
|
|
|
|
|
|
|
same time. Note that if you return 100 numbers at once and |
|
107
|
|
|
|
|
|
|
pass in C, you will have a sequence of 5000 random |
|
108
|
|
|
|
|
|
|
numbers. |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=cut |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub set_data { |
|
113
|
2
|
|
|
2
|
1
|
5289
|
my $self = shift; |
|
114
|
2
|
|
|
|
|
4
|
my $data = shift; |
|
115
|
2
|
100
|
|
|
|
18
|
if (_ARRAY($data)) { |
|
|
|
50
|
|
|
|
|
|
|
116
|
1
|
|
|
|
|
8
|
$self->{data} = $data; |
|
117
|
1
|
|
|
|
|
3
|
return 1; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
elsif (_CODE($data)) { |
|
120
|
1
|
|
|
|
|
4
|
$self->{data} = $data; |
|
121
|
1
|
|
|
|
|
238
|
my $n = shift; |
|
122
|
1
|
50
|
|
|
|
46
|
if (not _POSINT($n)) { |
|
123
|
0
|
|
|
|
|
0
|
croak("'set_data' needs an integer as second argument if the first argument is a code reference."); |
|
124
|
|
|
|
|
|
|
} |
|
125
|
1
|
|
|
|
|
16
|
$self->{n} = $n; |
|
126
|
1
|
|
|
|
|
4
|
return 1; |
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
else { |
|
129
|
0
|
|
|
|
|
0
|
croak("Invalid arguments to 'set_data'."); |
|
130
|
|
|
|
|
|
|
} |
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=head2 test |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Runs the sequence test on the data that was previously set using |
|
136
|
|
|
|
|
|
|
C. |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Returns three items: The first is the root mean square of the bin |
|
139
|
|
|
|
|
|
|
residuals divided by the number of random numbers. It I be |
|
140
|
|
|
|
|
|
|
used as a measure for the quality of the random number generator |
|
141
|
|
|
|
|
|
|
and should be as close to zero as possible. A better metric is to |
|
142
|
|
|
|
|
|
|
compare the following two return values. |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
The second return value is a reference to the array of frequencies. |
|
145
|
|
|
|
|
|
|
An example is in order here. Generating one million random numbers, |
|
146
|
|
|
|
|
|
|
I get: |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
[0, 416765, 181078, 56318, 11486, 1056, 150] |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
This means there were no sequences of length 0 (obvious), 416765 |
|
151
|
|
|
|
|
|
|
sequences of length 1, etc. There were no sequences of length 7 or |
|
152
|
|
|
|
|
|
|
greater. This example is a bad random number generator! (It's a |
|
153
|
|
|
|
|
|
|
linear congruent generator with C<(a*x_i+c)%m> and C, |
|
154
|
|
|
|
|
|
|
C, C, and C). |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
The third return value is similar in nature to the second in that it |
|
157
|
|
|
|
|
|
|
is a reference to an array containing sequence length frequencies. |
|
158
|
|
|
|
|
|
|
This one, however, contains the frequencies that would be expected for |
|
159
|
|
|
|
|
|
|
the given number of random numbers, were they uncorrelated. |
|
160
|
|
|
|
|
|
|
The number of bins has the maximum |
|
161
|
|
|
|
|
|
|
length of an occurring sequence as an upper limit. In the given example, |
|
162
|
|
|
|
|
|
|
you would get: (Dumped with Data::Dumper) |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
$VAR1 = [ |
|
165
|
|
|
|
|
|
|
'0', |
|
166
|
|
|
|
|
|
|
'416666.75', |
|
167
|
|
|
|
|
|
|
'183333.1', |
|
168
|
|
|
|
|
|
|
'52777.64722222222222222222222222222222222', |
|
169
|
|
|
|
|
|
|
'11507.89523809523809523809523809523809524', |
|
170
|
|
|
|
|
|
|
'2033.72068452380952380952380952380952381', |
|
171
|
|
|
|
|
|
|
'303.1287808641975308641975308641975308642', |
|
172
|
|
|
|
|
|
|
# ... |
|
173
|
|
|
|
|
|
|
]; |
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
Note that where I put in a C<# ...>, you would really see a couple |
|
176
|
|
|
|
|
|
|
more lines of numbers until the numbers go below an expected |
|
177
|
|
|
|
|
|
|
frequency of C<0.1>. |
|
178
|
|
|
|
|
|
|
For C and C, you get about |
|
179
|
|
|
|
|
|
|
39 sequences, C is expected to be found 4-5 times, etc. |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=cut |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub test { |
|
184
|
2
|
|
|
2
|
1
|
1628
|
my $self = shift; |
|
185
|
2
|
|
|
|
|
7
|
my $data = $self->{data}; |
|
186
|
|
|
|
|
|
|
|
|
187
|
2
|
50
|
|
|
|
8
|
if (not defined $data) { |
|
188
|
0
|
|
|
|
|
0
|
croak("Set data using 'set_data' first."); |
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# bin counters |
|
192
|
2
|
|
|
|
|
2
|
my @bins; |
|
193
|
|
|
|
|
|
|
# current sequence type (> or <) |
|
194
|
2
|
|
|
|
|
3
|
my $current = undef; |
|
195
|
|
|
|
|
|
|
# current sequence length |
|
196
|
2
|
|
|
|
|
4
|
my $length = 0; |
|
197
|
|
|
|
|
|
|
# total number of random numbers |
|
198
|
2
|
|
|
|
|
3
|
my $numbers; |
|
199
|
|
|
|
|
|
|
|
|
200
|
2
|
100
|
|
|
|
7
|
if (_ARRAY($data)) { |
|
201
|
1
|
|
50
|
|
|
8
|
$current = ($data->[0] <=> $data->[1]) || 1; |
|
202
|
1
|
|
|
|
|
3
|
$length++; |
|
203
|
1
|
|
|
|
|
2
|
$numbers = @$data; |
|
204
|
|
|
|
|
|
|
|
|
205
|
1
|
|
|
|
|
4
|
foreach my $i (1 .. $#$data-1) { |
|
206
|
9998
|
|
50
|
|
|
24182
|
my $cmp = ($data->[$i] <=> $data->[$i+1]) || 1; |
|
207
|
9998
|
100
|
|
|
|
12521
|
if ($current == $cmp) { |
|
208
|
3275
|
|
|
|
|
3785
|
$length++; |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
else { |
|
211
|
6723
|
|
|
|
|
6036
|
$current = $cmp; |
|
212
|
6723
|
|
|
|
|
5653
|
$bins[$length]++; |
|
213
|
6723
|
|
|
|
|
6978
|
$length = 1; |
|
214
|
|
|
|
|
|
|
} |
|
215
|
|
|
|
|
|
|
} |
|
216
|
1
|
|
|
|
|
6
|
$bins[$length]++; |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
else { # CODE |
|
219
|
1
|
|
|
|
|
2
|
my @cache; |
|
220
|
1
|
|
|
|
|
4
|
my $calls = $self->{n}; |
|
221
|
1
|
|
|
|
|
2
|
my $first_run = 1; |
|
222
|
1
|
|
|
|
|
4
|
foreach (1..$calls) { |
|
223
|
|
|
|
|
|
|
# get new data |
|
224
|
100
|
|
|
|
|
254
|
push @cache, $data->(); |
|
225
|
|
|
|
|
|
|
# first run => initialize with first comparison |
|
226
|
100
|
100
|
66
|
|
|
3692
|
if ($first_run and @cache > 1) { |
|
227
|
1
|
|
50
|
|
|
4
|
$current = ($cache[0] <=> $cache[1]) || 1; |
|
228
|
1
|
|
|
|
|
2
|
shift @cache; |
|
229
|
1
|
|
|
|
|
1
|
$length++; # == 1 |
|
230
|
1
|
|
|
|
|
2
|
$numbers++; # == 1 |
|
231
|
1
|
|
|
|
|
1
|
$first_run = 0; |
|
232
|
|
|
|
|
|
|
} |
|
233
|
100
|
|
|
|
|
205
|
while (@cache > 1) { |
|
234
|
9998
|
|
|
|
|
9812
|
$numbers++; |
|
235
|
9998
|
|
|
|
|
9960
|
my $this = shift @cache; |
|
236
|
9998
|
|
50
|
|
|
19492
|
my $cmp = ($this <=> $cache[0]) || 1; |
|
237
|
9998
|
100
|
|
|
|
13538
|
if ($current == $cmp) { |
|
238
|
3273
|
|
|
|
|
6710
|
$length++; |
|
239
|
|
|
|
|
|
|
} |
|
240
|
|
|
|
|
|
|
else { |
|
241
|
6725
|
|
|
|
|
6112
|
$current = $cmp; |
|
242
|
6725
|
|
|
|
|
6059
|
$bins[$length]++; |
|
243
|
6725
|
|
|
|
|
13532
|
$length = 1; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
|
|
|
|
|
|
} |
|
246
|
|
|
|
|
|
|
} |
|
247
|
1
|
|
|
|
|
4
|
$bins[$length]++; |
|
248
|
|
|
|
|
|
|
} |
|
249
|
|
|
|
|
|
|
|
|
250
|
2
|
|
|
|
|
10
|
my @expected = (0); # 0-bin is empty |
|
251
|
2
|
|
|
|
|
6
|
foreach my $bin (1..$#bins) { |
|
252
|
13
|
|
|
|
|
29162
|
$expected[$bin] = expected_frequency($bin, $numbers-1); |
|
253
|
|
|
|
|
|
|
} |
|
254
|
2
|
|
|
|
|
5270
|
my $last_bin = $#bins; |
|
255
|
2
|
|
|
|
|
11
|
while ($expected[$last_bin] > 0.1) { |
|
256
|
3
|
|
|
|
|
8307
|
$last_bin++; |
|
257
|
3
|
|
|
|
|
85
|
$expected[$last_bin] = expected_frequency($last_bin, $numbers-1); |
|
258
|
|
|
|
|
|
|
} |
|
259
|
|
|
|
|
|
|
|
|
260
|
2
|
|
|
|
|
5831
|
foreach my $bin (0..$last_bin) { |
|
261
|
18
|
100
|
|
|
|
40
|
$bins[$bin] = 0 if not defined $bins[$bin]; |
|
262
|
|
|
|
|
|
|
} |
|
263
|
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
|
|
265
|
2
|
|
|
|
|
8
|
my @diff = map { abs($bins[$_] - $expected[$_]) } 0..$#bins; |
|
|
18
|
|
|
|
|
5062
|
|
|
266
|
|
|
|
|
|
|
|
|
267
|
2
|
|
|
|
|
412
|
my $residual = 0; |
|
268
|
2
|
|
|
|
|
13
|
$residual += $_**2 for @diff; |
|
269
|
2
|
|
|
|
|
10229
|
$residual = sqrt($residual); |
|
270
|
2
|
|
|
|
|
11661
|
$residual = "$residual"; # de-bigfloatize |
|
271
|
|
|
|
|
|
|
|
|
272
|
2
|
|
|
|
|
126
|
@expected = map {"$_"} @expected; # de-bigfloatize |
|
|
18
|
|
|
|
|
711
|
|
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
return( |
|
275
|
2
|
|
|
|
|
139
|
$residual / ($numbers-1), |
|
276
|
|
|
|
|
|
|
\@bins, |
|
277
|
|
|
|
|
|
|
\@expected, |
|
278
|
|
|
|
|
|
|
); |
|
279
|
|
|
|
|
|
|
} |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head1 SUBROUTINES |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=head2 expected_frequency |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
Returns the expected frequency of the sequence length C |
|
286
|
|
|
|
|
|
|
in a set of C random numbers assuming uncorrelated random |
|
287
|
|
|
|
|
|
|
numbers. |
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
Returns this as a L. |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
Expects C and C as arguments. |
|
292
|
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
This subroutine is memoized. (See L.) |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=cut |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
memoize('expected_frequency'); |
|
298
|
|
|
|
|
|
|
sub expected_frequency { |
|
299
|
|
|
|
|
|
|
my $k = Math::BigFloat->new(shift); |
|
300
|
|
|
|
|
|
|
my $n = Math::BigFloat->new(shift); |
|
301
|
|
|
|
|
|
|
return( |
|
302
|
|
|
|
|
|
|
2 * ( ($k**2 + 3*$k + 1)*$n - ($k**3 + 3*$k**2 - $k - 4) ) |
|
303
|
|
|
|
|
|
|
/ faculty($k+3) |
|
304
|
|
|
|
|
|
|
); |
|
305
|
|
|
|
|
|
|
} |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head2 faculty |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
Computes the factulty of the first argument recursively as a |
|
310
|
|
|
|
|
|
|
L. This subroutine is memoized. (See L.) |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
=cut |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
memoize('faculty'); |
|
315
|
|
|
|
|
|
|
sub faculty { |
|
316
|
|
|
|
|
|
|
my $n = shift; |
|
317
|
|
|
|
|
|
|
return Math::BigFloat->bone() if $n <= 1; |
|
318
|
|
|
|
|
|
|
return $n * faculty($n-1); |
|
319
|
|
|
|
|
|
|
} |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
1; |
|
322
|
|
|
|
|
|
|
__END__ |