| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::DOOP::Util::Run::GeneMerge; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
5
|
use strict; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
34
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
24
|
|
|
5
|
1
|
|
|
1
|
|
912
|
use POSIX; |
|
|
1
|
|
|
|
|
8147
|
|
|
|
1
|
|
|
|
|
6
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Bio::DOOP::Util::Run::GeneMerge - GeneMerge based GO analyzer |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.02 |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.02'; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
#!/usr/bin/perl -w |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Bio::DOOP::DOOP; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$test = Bio::DOOP::Util::Run::GeneMerge->new(); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
if ($test->getDescFile("GO/use/GO.BP.use") < 0){ |
|
28
|
|
|
|
|
|
|
print"Desc error\n" |
|
29
|
|
|
|
|
|
|
} |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
if ($test->getAssocFile("GO/assoc/A_thaliana.converted.BP") < 0){ |
|
32
|
|
|
|
|
|
|
print"Assoc error\n" |
|
33
|
|
|
|
|
|
|
} |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
if ($test->getPopFile("GO/pop.500") < 0){ |
|
36
|
|
|
|
|
|
|
print"Pop error\n" |
|
37
|
|
|
|
|
|
|
} |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
if ($test->getStudyFile("GO/study.500/combined1314.list") < 0){ |
|
40
|
|
|
|
|
|
|
print"Study error\n" |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
$results = $test->getResults(); |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
foreach $res (@{$results}) { |
|
46
|
|
|
|
|
|
|
print $$res{'GOterm'}," ",$$res{'RawEs'},"\n"; |
|
47
|
|
|
|
|
|
|
} |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
This is a module based on GeneMerge v1.2. |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Original program described in: |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
Cristian I. Castillo-Davis and Daniel L. Hartl |
|
56
|
|
|
|
|
|
|
GeneMerge - post-genomic analysis, data mining, and hypothesis testing |
|
57
|
|
|
|
|
|
|
Bioinformatics Vol. 19 no. 7 2003, Pages 891-892 |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The original program is not really good for large scale analysis, |
|
60
|
|
|
|
|
|
|
because the design uses a lot of I/O processes. This version fetches |
|
61
|
|
|
|
|
|
|
everything into memory at start. |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 AUTHORS |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Tibor Nagy, Godollo, Endre Sebestyen, Martonvasar, |
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 METHODS |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head2 new |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Create new GeneMerge object. |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
$genemerge = Bio::DOOP::Util::Run::GeneMerge->new; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=cut |
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# Constructor: creates an empty GeneMerge analyzer object.
#
# Can be called as a class method (Bio::DOOP::Util::Run::GeneMerge->new),
# on an existing object, or from a subclass; the new object is blessed into
# the invocant's class. When no invocant is supplied the current package is
# used, which is what the previous one-argument bless always did.
#
# Returns the blessed hash reference.
sub new {
    my $invocant = shift;
    my $class    = ref($invocant) || $invocant || __PACKAGE__;

    my $self = {
        HoAssoc               => {},    # gene id => [GO ids], filled by getAssocFile
        HoPopAssocCount       => {},    # GO id => number of population genes carrying it
        HoPopAssocFreq        => {},    # GO id => HoPopAssocCount / PopGeneNo (popFreq)
        PopGeneNo             => 0,     # lines read from the population file
        HoDesc                => {},    # GO id => description, filled by getDescFile
        StudyGeneNo           => 0,     # lines read from the study file
        StudyGeneNoAssoc      => 0,     # study genes without an entry in HoAssoc
        HoStudyGeneAssocCount => {},    # GO id => number of study genes carrying it
        HoAssocStudyGene      => {},    # GO id => [study genes carrying it]
        StudyGeneUniqAssoc    => 0,     # number of distinct GO ids seen in the study set
        BonferroniCorr        => 0,     # multiplier applied to raw P values
        HoStudyGeneAssocPVal  => {},    # GO id => [raw P value, corrected P value]
    };

    return bless $self, $class;
}
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=head2 getAssocFile |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
The method loads the GO association file and stores it in memory. |
|
101
|
|
|
|
|
|
|
The file format is the following. Each line starts with a cluster id, and after some whitespace |
|
102
|
|
|
|
|
|
|
the associated GO ids are enumerated, separated by semicolons. |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
81001020 GO:0016020;GO:0003674;GO:0008150 |
|
105
|
|
|
|
|
|
|
81001110 GO:0005739;GO:0003674 |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
$genemerge->getAssocFile('/tmp/assoc.txt'); |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=cut |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Loads the gene-to-GO association file into $self->{HoAssoc}.
#
# Each line is split on whitespace: the first field is the gene (cluster) id,
# the second field is a semicolon-separated list of GO ids. Lines without a
# second field are ignored. A gene that appears again replaces its earlier
# GO list.
#
# Parameters:
#   $filename - path of the association file
# Returns 0 on success, -1 when the file cannot be opened.
sub getAssocFile {
    my $self     = shift;
    my $filename = shift;

    # Three-argument open treats an undef name as "anonymous temp file",
    # so reject missing names explicitly to keep the -1 error contract.
    return -1 unless defined $filename && length $filename;

    # Lexical handle + explicit read mode: the file name can no longer
    # inject an open mode or a pipe.
    open my $assoc_fh, '<', $filename or return -1;

    while (my $line = <$assoc_fh>) {
        chomp $line;
        my ($assoc_gene, $go_field) = split ' ', $line;

        # No GO column (or an empty/blank line): nothing to record.
        next unless $go_field;

        @{ $self->{HoAssoc}{$assoc_gene} } = split /;/, $go_field;
    }
    close $assoc_fh;

    return 0;
}
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
=head2 getPopFile |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
The method loads the population file and stores it in memory. |
|
137
|
|
|
|
|
|
|
The file format is the following. Each line contains one and only one |
|
138
|
|
|
|
|
|
|
cluster id. |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
81001020 |
|
141
|
|
|
|
|
|
|
81001110 |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
$genemerge->getPopFile('/tmp/pop.txt'); |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# Loads the population file and counts GO annotations in the population.
#
# Every line of the file is taken as one gene (cluster) id and increments
# $self->{PopGeneNo}. For genes present in $self->{HoAssoc}, each associated
# GO id is counted in $self->{HoPopAssocCount}. Afterwards popFreq() is
# called on the object.
#
# Parameters:
#   $filename - path of the population file
# Returns 0 on success, -1 when the file cannot be opened.
sub getPopFile {
    my $self     = shift;
    my $filename = shift;

    # Three-argument open treats an undef name as "anonymous temp file",
    # so reject missing names explicitly to keep the -1 error contract.
    return -1 unless defined $filename && length $filename;

    open my $pop_fh, '<', $filename or return -1;

    while (my $pop_gene = <$pop_fh>) {
        chomp $pop_gene;
        $self->{PopGeneNo}++;

        # Genes without any association only contribute to the total.
        next unless exists $self->{HoAssoc}{$pop_gene};

        for my $assoc_go (@{ $self->{HoAssoc}{$pop_gene} }) {
            $self->{HoPopAssocCount}{$assoc_go}++;
        }
    }
    close $pop_fh;

    $self->popFreq();

    return 0;
}
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=head2 popFreq |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
The method calculates the population frequency. Do not use it directly. |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=cut |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# Converts the population GO counts into frequencies.
#
# For every GO id in $self->{HoPopAssocCount} the count is divided by
# $self->{PopGeneNo} and stored in $self->{HoPopAssocFreq}. Called by
# getPopFile; not meant to be used directly.
#
# When no population gene has been counted the method does nothing instead
# of dying with a division by zero.
#
# Returns nothing.
sub popFreq {
    my $self = shift;

    my $pop_size = $self->{PopGeneNo};
    return unless $pop_size;

    my $count_of = $self->{HoPopAssocCount} || {};
    for my $go_id (keys %{$count_of}) {
        $self->{HoPopAssocFreq}{$go_id} = $count_of->{$go_id} / $pop_size;
    }

    return;
}
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head2 getDescFile |
|
186
|
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
The method loads the GO description file. |
|
188
|
|
|
|
|
|
|
The file format is the following. Each line starts with the GO id, followed by a tab and |
|
189
|
|
|
|
|
|
|
the description of the GO id. |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
GO:0000007 low-affinity zinc ion transporter activity |
|
192
|
|
|
|
|
|
|
GO:0000008 thioredoxin |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
$genemerge->getDescFile('/tmp/desc.txt'); |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=cut |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Loads the GO description file into $self->{HoDesc}.
#
# Each line is split at its first whitespace character: the part before it
# is the GO id, everything after it is the description. Lines that do not
# start with an id (blank lines, lines beginning with whitespace) are
# skipped.
#
# Parameters:
#   $filename - path of the description file
# Returns 0 on success, -1 when the file cannot be opened.
sub getDescFile {
    my $self     = shift;
    my $filename = shift;

    # Three-argument open treats an undef name as "anonymous temp file",
    # so reject missing names explicitly to keep the -1 error contract.
    return -1 unless defined $filename && length $filename;

    open my $desc_fh, '<', $filename or return -1;

    while (my $line = <$desc_fh>) {
        chomp $line;
        my ($go_id, $description) = split /\s/, $line, 2;

        # Avoid storing an undef/empty hash key for malformed lines.
        next unless defined $go_id && length $go_id;

        $self->{HoDesc}{$go_id} = $description;
    }
    close $desc_fh;

    return 0;
}
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head2 getStudyFile |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
The method loads the study data set, counts GO frequencies, calculates P values |
|
216
|
|
|
|
|
|
|
based on the hypergeometric distribution, and corrects P values, based on the |
|
217
|
|
|
|
|
|
|
Bonferroni method. |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
The file format of the study file is the following. Each line contains one and only one |
|
220
|
|
|
|
|
|
|
cluster id. |
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
81001020 |
|
223
|
|
|
|
|
|
|
81001110 |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
$genemerge->getStudyFile('/tmp/study.txt'); |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=cut |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Loads the study gene set and scores every GO id found in it.
#
# Steps:
#   1. read the study file (one gene/cluster id per line) and tally GO ids,
#   2. determine the Bonferroni multiplier,
#   3. compute raw and corrected P values with $self->hypergeometric.
#
# The association file and the population file are expected to have been
# loaded before (the method reads HoAssoc, HoPopAssocFreq and PopGeneNo).
#
# Parameters:
#   $filename - path of the study file
# Returns 0 on success, -1 when the file cannot be opened.
sub getStudyFile {
    my $self     = shift;
    my $filename = shift;

    # Three-argument open treats an undef name as "anonymous temp file",
    # so reject missing names explicitly to keep the -1 error contract.
    return -1 unless defined $filename && length $filename;

    open my $study_fh, '<', $filename or return -1;
    _tallyStudyGenes($self, $study_fh);
    close $study_fh;

    _countBonferroniTests($self);
    _scoreStudyTerms($self);

    return 0;
}

# Reads study gene ids from an open handle and updates the per-GO tallies
# (StudyGeneNo, StudyGeneNoAssoc, HoStudyGeneAssocCount, HoAssocStudyGene).
sub _tallyStudyGenes {
    my ($self, $study_fh) = @_;

    while (my $study_gene = <$study_fh>) {
        chomp $study_gene;
        $self->{StudyGeneNo}++;

        if (exists $self->{HoAssoc}{$study_gene}) {
            for my $go_id (@{ $self->{HoAssoc}{$study_gene} }) {
                $self->{HoStudyGeneAssocCount}{$go_id}++;
                push @{ $self->{HoAssocStudyGene}{$go_id} }, $study_gene;
            }
        }
        else {
            $self->{StudyGeneNoAssoc}++;
        }
    }

    return;
}

# Recomputes StudyGeneUniqAssoc (distinct GO ids in the study set) and
# BonferroniCorr (those GO ids whose population frequency exceeds
# 1 / PopGeneNo).
sub _countBonferroniTests {
    my ($self) = @_;

    my $pop_size = $self->{PopGeneNo} || 0;

    # Without a population there is no frequency to compare against;
    # avoid dividing by zero.
    my $min_freq = $pop_size > 0 ? 1 / $pop_size : 0;

    # Both counters are derived from the complete tally, so start from zero;
    # otherwise a second getStudyFile call would count every GO id again.
    $self->{StudyGeneUniqAssoc} = 0;
    $self->{BonferroniCorr}     = 0;

    for my $go_id (keys %{ $self->{HoStudyGeneAssocCount} || {} }) {
        $self->{StudyGeneUniqAssoc}++;

        # GO ids never seen in the population have no frequency entry.
        my $pop_freq = $self->{HoPopAssocFreq}{$go_id};
        if (defined $pop_freq && $pop_freq > $min_freq) {
            $self->{BonferroniCorr}++;
        }
    }

    return;
}

# Stores [raw P value, Bonferroni-corrected P value] per GO id in
# HoStudyGeneAssocPVal. GO ids carried by exactly one study gene get
# 'NA' for both values.
sub _scoreStudyTerms {
    my ($self) = @_;

    my $N = $self->{PopGeneNo} || 0;
    my $K = $self->{StudyGeneNo};

    for my $go_id (keys %{ $self->{HoStudyGeneAssocCount} || {} }) {
        my $R = $self->{HoStudyGeneAssocCount}{$go_id};
        my $P = $self->{HoPopAssocFreq}{$go_id};
        $P = 0 unless defined $P;    # GO id absent from the population

        my ($raw_p, $corrected_p) = ('NA', 'NA');
        if ($R != 1) {
            $raw_p = $self->hypergeometric($N, $P, $K, $R);

            # Corrected value is capped at 1.
            my $scaled = $raw_p * $self->{BonferroniCorr};
            $corrected_p = $scaled >= 1 ? 1 : $scaled;
        }

        @{ $self->{HoStudyGeneAssocPVal}{$go_id} }[0, 1] = ($raw_p, $corrected_p);
    }

    return;
}
|
281
|
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=head2 getResults |
|
283
|
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
The method gives back all the results as an arrayref of hashes. |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
$results = $genemerge->getResults(); |
|
287
|
|
|
|
|
|
|
foreach $result (@{$results}) { |
|
288
|
|
|
|
|
|
|
$goterm = $$result{'GOterm'}; |
|
289
|
|
|
|
|
|
|
$popfreq = $$result{'PopFreq'}; |
|
290
|
|
|
|
|
|
|
$popfrac = $$result{'PopFrac'}; |
|
291
|
|
|
|
|
|
|
$studyfrac = $$result{'StudyFrac'}; |
|
292
|
|
|
|
|
|
|
$studyfracall = $$result{'StudyFracAll'}; |
|
293
|
|
|
|
|
|
|
$raw_escore = $$result{'RawEs'}; |
|
294
|
|
|
|
|
|
|
$escore = $$result{'EScore'}; |
|
295
|
|
|
|
|
|
|
$desc = $$result{'Desc'}; |
|
296
|
|
|
|
|
|
|
@contrib = @{$$result{'Contrib'}}; |
|
297
|
|
|
|
|
|
|
} |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
=cut |
|
300
|
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
# Collects the analysis results, one record per GO id found in the study
# set, ordered by GO id (plain string sort).
#
# Each record is a hash reference with the keys GOterm, PopFreq, PopFrac,
# PopFracAll, StudyFrac, StudyFracAll, RawEs, EScore, Desc and Contrib.
# Contrib refers to the object's own list of contributing study genes
# (it is not a copy).
#
# Returns a reference to the array of records.
sub getResults {
    my $self = shift;

    my @records = map {
        my $term = $_;
        {
            'GOterm'       => $term,
            'PopFreq'      => $self->{HoPopAssocFreq}{$term},
            'PopFrac'      => $self->{HoPopAssocCount}{$term},
            'PopFracAll'   => $self->{PopGeneNo},
            'StudyFrac'    => $self->{HoStudyGeneAssocCount}{$term},
            'StudyFracAll' => $self->{StudyGeneNo},
            'RawEs'        => $self->{HoStudyGeneAssocPVal}{$term}[0],
            'EScore'       => $self->{HoStudyGeneAssocPVal}{$term}[1],
            'Desc'         => $self->{HoDesc}{$term},
            'Contrib'      => \@{ $self->{HoAssocStudyGene}{$term} },
        };
    } sort keys %{ $self->{HoStudyGeneAssocCount} };

    return \@records;
}
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=head2 hypergeometric |
|
324
|
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
This is an internal function to calculate the hypergeometric distribution. Do not use it directly. |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=cut |
|
328
|
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
# Internal: upper-tail hypergeometric probability. Do not use it directly.
#
# With np = round(n * p) annotated genes in a population of n genes, returns
# the sum over i = r .. min(np, k) of
#     C(np, i) * C(nq, k - i) / C(n, k)
# i.e. the probability of finding at least r annotated genes in a sample of
# k genes. All terms are handled as logarithms (via logNchooseK) and only
# exponentiated when added to the sum.
#
# Parameters:
#   $n - population size
#   $p - fraction of the population carrying the annotation
#   $k - study (sample) size
#   $r - number of study genes carrying the annotation
# Returns the summed probability (0 when r exceeds min(np, k)).
sub hypergeometric {
    my $self = shift;
    my ($n, $p, $k, $r) = @_;

    my $q = 1 - $p;

    # Round to the nearest whole gene count.
    my $np = floor($n * $p + 0.5);
    my $nq = floor($n * $q + 0.5);

    my $log_n_choose_k = logNchooseK($n, $k);

    # Largest possible number of annotated genes in the sample.
    my $top = ($np < $k) ? $np : $k;

    # Log of the numerator for i == $top. The rounded $nq is used (rather
    # than the raw product $n * (1 - $p)) so that logNchooseK iterates over
    # whole numbers and matches the recurrence below.
    my $lfoo = logNchooseK($np, $top) + logNchooseK($nq, $k - $top);

    my $sum = 0;
    for (my $i = $top; $i >= $r; $i--) {
        $sum += exp($lfoo - $log_n_choose_k);

        # Step the log numerator from i to i - 1:
        #   C(np, i-1)    / C(np, i)   = i / (np - i + 1)
        #   C(nq, k-i+1)  / C(nq, k-i) = (nq - k + i) / (k - i + 1)
        if ($i > $r) {
            $lfoo = $lfoo
                  + log($i / ($np - $i + 1))
                  + log(($nq - $k + $i) / ($k - $i + 1));
        }
    }

    return $sum;
}
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=head2 logNchooseK |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Internal function that computes the natural logarithm of the binomial coefficient (n choose k). Do not use it directly. |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
=cut |
|
369
|
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
# Internal: natural logarithm of the binomial coefficient C(n, k).
# Do not use it directly.
#
# Uses the smaller of k and n - k, adds log(i) for i = n down to n - k + 1
# and subtracts lFactorial(k).
#
# Parameters:
#   $n - total number of items
#   $k - number of items chosen
# Returns the logarithm as a number.
sub logNchooseK {
    my ($n, $k) = @_;

    # C(n, k) == C(n, n - k): work with the smaller of the two.
    my $complement = $n - $k;
    $k = $complement if $k > $complement;

    my $lower_bound = $n - $k;
    my $log_sum     = 0;
    my $factor      = $n;
    while ($factor > $lower_bound) {
        $log_sum += log($factor);
        $factor--;
    }

    return $log_sum - lFactorial($k);
}
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head2 lFactorial |
|
387
|
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Computes the natural logarithm of the factorial of a number. Do not use it directly. |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=cut |
|
391
|
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# Internal: natural logarithm of a factorial, computed as
# log(2) + log(3) + ... + log(number). Do not use it directly.
#
# Parameters:
#   $number - upper bound of the product
# Returns the summed logarithm; 0 when $number is below 2.
sub lFactorial {
    my ($number) = @_;

    my $log_sum = 0;
    $log_sum += log($_) for 2 .. $number;

    return $log_sum;
}
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
1; |