line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::DOOP::Util::Run::GeneMerge; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
34
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
24
|
|
5
|
1
|
|
|
1
|
|
912
|
use POSIX; |
|
1
|
|
|
|
|
8147
|
|
|
1
|
|
|
|
|
6
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Bio::DOOP::Util::Run::GeneMerge - GeneMerge based GO analyzer |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.02 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
#!/usr/bin/perl -w |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Bio::DOOP::DOOP; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$test = Bio::DOOP::Util::Run::GeneMerge->new(); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
if ($test->getDescFile("GO/use/GO.BP.use") < 0){ |
28
|
|
|
|
|
|
|
print"Desc error\n" |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
if ($test->getAssocFile("GO/assoc/A_thaliana.converted.BP") < 0){ |
32
|
|
|
|
|
|
|
print"Assoc error\n" |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
if ($test->getPopFile("GO/pop.500") < 0){ |
36
|
|
|
|
|
|
|
print"Pop error\n" |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
if ($test->getStudyFile("GO/study.500/combined1314.list") < 0){ |
40
|
|
|
|
|
|
|
print"Study error\n" |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
$results = $test->getResults(); |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
foreach $res (@{$results}) { |
46
|
|
|
|
|
|
|
print $$res{'GOterm'}," ",$$res{'RawEs'},"\n"; |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 DESCRIPTION |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
This is a module based on GeneMerge v1.2. |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Original program described in: |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
Cristian I. Castillo-Davis and Daniel L. Hartl |
56
|
|
|
|
|
|
|
GeneMerge - post-genomic analysis, data mining, and hypothesis testing |
57
|
|
|
|
|
|
|
Bioinformatics Vol. 19 no. 7 2003, Pages 891-892 |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The original program is not really good for large scale analysis, |
60
|
|
|
|
|
|
|
because the design uses a lot of I/O processes. This version fetches |
61
|
|
|
|
|
|
|
everything into memory at start. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 AUTHORS |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Tibor Nagy, Godollo, Endre Sebestyen, Martonvasar, |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 METHODS |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head2 new |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Create new GeneMerge object. |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
$genemerge = Bio::DOOP::Util::Run::GeneMerge->new; |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=cut |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Constructor: creates an empty GeneMerge analysis object.
#
# Normally called as a class method:
#     my $genemerge = Bio::DOOP::Util::Run::GeneMerge->new;
# It also works when called on an existing instance (the new object gets the
# same class) or as a plain function without arguments (the object is then
# blessed into this package).
#
# Returns a blessed hash reference.  Every lookup table starts as an empty
# hash and every counter starts at 0.
sub new {
    my $invocant = shift;

    # Two-argument bless, so that subclasses get objects of their own class.
    my $class = ref($invocant) || $invocant || __PACKAGE__;

    my $self = {
        HoAssoc               => {},    # gene id => [ GO ids ]        (getAssocFile)
        HoPopAssocCount       => {},    # GO id => population genes    (getPopFile)
        HoPopAssocFreq        => {},    # GO id => count / PopGeneNo   (popFreq)
        PopGeneNo             => 0,     # lines read from the population file
        HoDesc                => {},    # GO id => description         (getDescFile)
        StudyGeneNo           => 0,     # lines read from the study file
        StudyGeneNoAssoc      => 0,     # study genes absent from HoAssoc
        HoStudyGeneAssocCount => {},    # GO id => study genes carrying it
        HoAssocStudyGene      => {},    # GO id => [ contributing study genes ]
        StudyGeneUniqAssoc    => 0,     # distinct GO ids seen in the study set
        BonferroniCorr        => 0,     # multiplier used for corrected P values
        HoStudyGeneAssocPVal  => {},    # GO id => [ raw P, corrected P ]
    };

    return bless $self, $class;
}
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=head2 getAssocFile |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
The method loads the GO association file and stores it in memory. |
101
|
|
|
|
|
|
|
The file format is the following. Each line starts with a cluster id, and after some whitespace |
102
|
|
|
|
|
|
|
the associated GO ids are enumerated, separated by semicolons. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
81001020 GO:0016020;GO:0003674;GO:0008150 |
105
|
|
|
|
|
|
|
81001110 GO:0005739;GO:0003674 |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
$genemerge->getAssocFile('/tmp/assoc.txt'); |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=cut |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Loads the gene -> GO association file into $self->{HoAssoc}.
#
# Each line holds a gene (cluster) id, whitespace, and a semicolon-separated
# list of GO ids.  Lines that have no second field are ignored.  When an id
# occurs on several lines, the list from the last line wins.
#
# Parameters:
#   $filename - path of the association file.
# Returns 0 on success, -1 if the file cannot be opened.
sub getAssocFile {
    my ($self, $filename) = @_;

    # Three-argument open on a lexical handle: the file name is never parsed
    # for a mode or a pipe, and no package-global handle is clobbered.
    open(my $assoc_fh, '<', $filename) or return(-1);

    while (my $line = <$assoc_fh>) {
        chomp $line;

        # Field 0 is the gene id, field 1 the GO list; anything after is unused.
        my ($assoc_gene, $go_field) = split ' ', $line;
        next unless $go_field;

        @{ $self->{HoAssoc}{$assoc_gene} } = split /;/, $go_field;
    }
    close $assoc_fh;

    return(0);
}
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 getPopFile |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
The method loads the population file and stores it in memory. |
137
|
|
|
|
|
|
|
The file format is the following. Each line contains one and only one |
138
|
|
|
|
|
|
|
cluster id. |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
81001020 |
141
|
|
|
|
|
|
|
81001110 |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
$genemerge->getPopFile('/tmp/pop.txt'); |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# Loads the population file and counts how many population genes carry each
# GO id.
#
# Each line of the file is taken verbatim (minus the line ending) as one gene
# id.  Every line increments $self->{PopGeneNo}; for ids present in
# $self->{HoAssoc}, each associated GO id is counted in
# $self->{HoPopAssocCount}.  Afterwards popFreq() is called to derive the
# frequencies.  The association file must therefore be loaded first,
# otherwise no GO id is counted.
#
# Parameters:
#   $filename - path of the population file.
# Returns 0 on success, -1 if the file cannot be opened.
sub getPopFile {
    my ($self, $filename) = @_;

    # Three-argument open on a lexical handle (no mode/pipe parsing of the
    # file name, no package-global handle).
    open(my $pop_fh, '<', $filename) or return(-1);

    while (my $PopGene = <$pop_fh>) {
        chomp $PopGene;
        $self->{PopGeneNo}++;

        # Genes without an association entry only contribute to the total.
        next unless exists $self->{HoAssoc}{$PopGene};

        foreach my $AssocGO (@{ $self->{HoAssoc}{$PopGene} }) {
            $self->{HoPopAssocCount}{$AssocGO}++;
        }
    }
    close $pop_fh;

    $self->popFreq();

    return(0);
}
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head2 popFreq |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
The method calculates the population frequency. Do not use it directly. |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=cut |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# Derives the population frequency of every counted GO id:
#     HoPopAssocFreq{id} = HoPopAssocCount{id} / PopGeneNo
# Internal helper invoked by getPopFile(); it reads and writes only the
# object's own tables and has no meaningful return value.
sub popFreq {
    my ($self) = @_;

    my $population_size = $self->{PopGeneNo};

    $self->{HoPopAssocFreq}{$_} = $self->{HoPopAssocCount}{$_} / $population_size
        for keys %{ $self->{HoPopAssocCount} };
}
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head2 getDescFile |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
The method loads the GO description file. |
188
|
|
|
|
|
|
|
The file format is the following. Each line starts with the GO id, and separated by a tab, |
189
|
|
|
|
|
|
|
the description of the GO id. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
GO:0000007 low-affinity zinc ion transporter activity |
192
|
|
|
|
|
|
|
GO:0000008 thioredoxin |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
$genemerge->getDescFile('/tmp/desc.txt'); |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=cut |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Loads the GO description file into $self->{HoDesc}.
#
# Each line is split at its first whitespace character: the part before it
# is the GO id, the remainder (which may itself contain whitespace) is the
# description.  Empty lines are skipped.
#
# Parameters:
#   $filename - path of the description file.
# Returns 0 on success, -1 if the file cannot be opened.
sub getDescFile {
    my ($self, $filename) = @_;

    # Three-argument open on a lexical handle (no mode/pipe parsing of the
    # file name, no package-global handle).
    open(my $desc_fh, '<', $filename) or return(-1);

    while (my $line = <$desc_fh>) {
        chomp $line;

        # An empty line has no id; storing it would use undef as a hash key.
        next unless length $line;

        my ($go_id, $description) = split /\s/, $line, 2;
        $self->{HoDesc}{$go_id} = $description;
    }
    close $desc_fh;

    return(0);
}
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head2 getStudyFile |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
The method loads the study data set, counts GO frequencies, calculates P values |
216
|
|
|
|
|
|
|
based on the hypergeometric distribution, and corrects P values, based on the |
217
|
|
|
|
|
|
|
Bonferroni method. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
The file format of the study file is the following. Each line contains one and only one |
220
|
|
|
|
|
|
|
cluster id. |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
81001020 |
223
|
|
|
|
|
|
|
81001110 |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
$genemerge->getStudyFile('/tmp/study.txt'); |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=cut |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Loads the study gene list and computes, for every GO id found in it, a raw
# and a Bonferroni-corrected P value.
#
# Steps:
#   1. Read the study file (one gene id per line, taken verbatim).  Every
#      line increments StudyGeneNo.  Ids present in HoAssoc add to
#      HoStudyGeneAssocCount / HoAssocStudyGene for each of their GO ids;
#      other ids increment StudyGeneNoAssoc.
#   2. Count the distinct GO ids (StudyGeneUniqAssoc) and, among them, those
#      whose population frequency exceeds 1 / PopGeneNo (BonferroniCorr).
#   3. For each GO id carried by a number of study genes other than one, call
#      hypergeometric() and store [ raw P, min(1, raw P * BonferroniCorr) ]
#      in HoStudyGeneAssocPVal.  GO ids carried by exactly one study gene get
#      [ 'NA', 'NA' ].
#
# The association and population files must be loaded beforehand: this
# method reads HoAssoc, HoPopAssocFreq and PopGeneNo.  Counters are
# incremented, never reset, so a second call on the same object accumulates
# on top of the first.
#
# Parameters:
#   $filename - path of the study file.
# Returns 0 on success, -1 if the file cannot be opened.
#
# TODO we should split this in 2 or 3.
sub getStudyFile {
    my ($self, $filename) = @_;

    # Three-argument open on a lexical handle (no mode/pipe parsing of the
    # file name, no package-global handle).
    open(my $study_fh, '<', $filename) or return(-1);

    while (my $StudyGene = <$study_fh>) {
        chomp $StudyGene;
        $self->{StudyGeneNo}++;

        if (exists $self->{HoAssoc}{$StudyGene}) {
            foreach my $StudyGeneGO (@{ $self->{HoAssoc}{$StudyGene} }) {
                $self->{HoStudyGeneAssocCount}{$StudyGeneGO}++;
                push @{ $self->{HoAssocStudyGene}{$StudyGeneGO} }, $StudyGene;
            }
        }
        else {
            $self->{StudyGeneNoAssoc}++;
        }
    }
    close $study_fh;

    # Bonferroni correction
    foreach my $go_term (keys %{ $self->{HoStudyGeneAssocCount} }) {
        $self->{StudyGeneUniqAssoc}++;

        # Frequency of a GO id carried by a single population gene.
        my $singleton_freq = 1 / $self->{PopGeneNo};

        # A GO id that never occurred in the population has no frequency;
        # it cannot exceed the threshold.
        my $pop_freq = $self->{HoPopAssocFreq}{$go_term};
        if (defined $pop_freq && $pop_freq > $singleton_freq) {
            $self->{BonferroniCorr}++;
        }
    }

    # Calculate P-values based on hypergeometric distribution
    my $N = $self->{PopGeneNo};
    my $K = $self->{StudyGeneNo};

    foreach my $go_term (keys %{ $self->{HoStudyGeneAssocCount} }) {
        my $P = $self->{HoPopAssocFreq}{$go_term};
        $P = 0 unless defined $P;    # same numeric value as undef, minus the warning
        my $R = $self->{HoStudyGeneAssocCount}{$go_term};

        my ($PVal, $PValC) = ('NA', 'NA');
        if ($R != 1) {
            $PVal = $self->hypergeometric($N, $P, $K, $R);
            my $corrected = $PVal * $self->{BonferroniCorr};
            $PValC = ($corrected >= 1) ? 1 : $corrected;
        }

        @{ $self->{HoStudyGeneAssocPVal}{$go_term} }[0, 1] = ($PVal, $PValC);
    }

    return(0);
}
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head2 getResults |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
The method gives back all the results as an arrayref of hashes. |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$results = $genemerge->getResults(); |
287
|
|
|
|
|
|
|
foreach $result (@{$results}) { |
288
|
|
|
|
|
|
|
$goterm = $$result{'GOterm'}; |
289
|
|
|
|
|
|
|
$popfreq = $$result{'PopFreq'}; |
290
|
|
|
|
|
|
|
$popfrac = $$result{'PopFrac'}; |
291
|
|
|
|
|
|
|
$studyfrac = $$result{'StudyFrac'}; |
292
|
|
|
|
|
|
|
$studyfracall = $$result{'StudyFracAll'}; |
293
|
|
|
|
|
|
|
$raw_escore = $$result{'RawEs'}; |
294
|
|
|
|
|
|
|
$escore = $$result{'EScore'}; |
295
|
|
|
|
|
|
|
$desc = $$result{'Desc'}; |
296
|
|
|
|
|
|
|
@contrib = @{$$result{'Contrib'}}; |
297
|
|
|
|
|
|
|
} |
298
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=cut |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Collects the analysis results, one record per GO id found in the study
# set, ordered by GO id (default string sort).
#
# Returns a reference to an array of hash references with these keys:
#   GOterm       - the GO id
#   PopFreq      - HoPopAssocFreq entry for the id
#   PopFrac      - HoPopAssocCount entry for the id
#   PopFracAll   - PopGeneNo
#   StudyFrac    - HoStudyGeneAssocCount entry for the id
#   StudyFracAll - StudyGeneNo
#   RawEs        - element 0 of the id's HoStudyGeneAssocPVal entry
#   EScore       - element 1 of the id's HoStudyGeneAssocPVal entry
#   Desc         - HoDesc entry for the id (undef when none was loaded)
#   Contrib      - reference to the id's HoAssocStudyGene array (shared with
#                  the object, not a copy)
sub getResults {
    my ($self) = @_;

    my @rows = map {
        my $term = $_;
        +{
            'GOterm'       => $term,
            'PopFreq'      => $self->{HoPopAssocFreq}{$term},
            'PopFrac'      => $self->{HoPopAssocCount}{$term},
            'PopFracAll'   => $self->{PopGeneNo},
            'StudyFrac'    => $self->{HoStudyGeneAssocCount}{$term},
            'StudyFracAll' => $self->{StudyGeneNo},
            'RawEs'        => $self->{HoStudyGeneAssocPVal}{$term}[0],
            'EScore'       => $self->{HoStudyGeneAssocPVal}{$term}[1],
            'Desc'         => $self->{HoDesc}{$term},
            'Contrib'      => \@{ $self->{HoAssocStudyGene}{$term} },
        };
    } sort keys %{ $self->{HoStudyGeneAssocCount} };

    return(\@rows);
}
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=head2 hypergeometric |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
This is an internal function to calculate the hypergeometric distribution. Do not use it directly. |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=cut |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Upper tail of the hypergeometric distribution: the probability of drawing
# at least $r annotated genes when $k genes are drawn without replacement
# from a population of $n genes in which a fraction $p is annotated.
# Internal helper; do not call it directly.
#
# Parameters:
#   $n - population size
#   $p - fraction of the population carrying the annotation
#   $k - number of genes drawn (study set size)
#   $r - observed number of annotated genes in the draw
# Returns the tail probability, or 0 when $r exceeds the largest possible
# count min(round($n * $p), $k).
#
# The terms are generated in log space, starting from the largest possible
# count and walking downwards with the ratio of consecutive terms, so large
# binomial coefficients never have to be formed.
sub hypergeometric {
    my ($self, $n, $p, $k, $r) = @_;

    my $q = 1 - $p;

    # Annotated / non-annotated population genes, rounded to whole genes.
    my $np = floor($n * $p + 0.5);
    my $nq = floor($n * $q + 0.5);

    my $log_total = logNchooseK($n, $k);

    # Largest count that can occur in the draw.
    my $top = ($np < $k) ? $np : $k;

    # Smallest count that can occur: once the $nq non-annotated genes are
    # used up, the rest of the draw must be annotated.  Terms below it are
    # exactly zero, and stepping onto one would take log(0), which is fatal.
    my $lowest_possible = $k - $nq;
    my $bottom = ($r > $lowest_possible) ? $r : $lowest_possible;

    # log( C(np, top) * C(nq, k - top) ); the rounded $nq keeps it consistent
    # with the recurrence below.
    my $lfoo = logNchooseK($np, $top) + logNchooseK($nq, $k - $top);

    my $sum = 0;
    for (my $i = $top; $i >= $bottom; $i--) {
        $sum += exp($lfoo - $log_total);

        # Ratio term(i-1) / term(i), applied in log space.
        if ($i > $bottom) {
            $lfoo = $lfoo + log($i / ($np - $i + 1)) + log(($nq - $k + $i) / ($k - $i + 1));
        }
    }

    return $sum;
}
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head2 logNchooseK |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Another internal function for the correct statistical results. Do not use it directly. |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=cut |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Natural logarithm of the binomial coefficient C($n, $k).
# Internal helper (a plain function, not a method); do not call it directly.
#
# Parameters:
#   $n - size of the set
#   $k - number of elements chosen
# Returns log( n! / (k! * (n-k)!) ).
sub logNchooseK {
    my ($n, $k) = @_;

    # C(n, k) == C(n, n - k): work with the smaller of the two.
    my $mirror = $n - $k;
    $k = $mirror if $k > $mirror;

    # log of the falling product n * (n-1) * ... * (n-k+1)
    my $stop    = $n - $k;
    my $log_sum = 0;
    my $factor  = $n;
    while ($factor > $stop) {
        $log_sum += log($factor);
        $factor--;
    }

    # ... divided by k!
    return $log_sum - lFactorial($k);
}
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 lFactorial |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Factorial calculating function. Do not use it directly. |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=cut |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# Natural logarithm of $number! computed as log(2) + log(3) + ... + log($number).
# Internal helper (a plain function, not a method); do not call it directly.
#
# Parameters:
#   $number - the factorial argument
# Returns the sum described above; 0 for any argument below 2.
sub lFactorial {
    my ($number) = @_;

    my $log_factorial = 0;
    my $factor        = 2;
    while ($factor <= $number) {
        $log_factorial += log($factor);
        $factor++;
    }

    return $log_factorial;
}
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
1; |