File Coverage

blib/lib/Bio/DOOP/Util/Run/GeneMerge.pm

Criterion	Covered	Total	%
statement	9	164	5.4
branch	0	26	0.0
condition			n/a
subroutine	3	13	23.0
pod	10	10	100.0
total	22	213	10.3

line	stmt	bran	sub	pod	time	code
1						package Bio::DOOP::Util::Run::GeneMerge;
2
3	1		1		5	use strict;
	1				3
	1				34
4	1		1		5	use warnings;
	1				3
	1				24
5	1		1		912	use POSIX;
	1				8147
	1				6
6
7						=head1 NAME
8
9						Bio::DOOP::Util::Run::GeneMerge - GeneMerge based GO analyzer
10
11						=head1 VERSION
12
13						Version 0.02
14
15						=cut
16
17						our $VERSION = '0.02';
18
19						=head1 SYNOPSIS
20
21						#!/usr/bin/perl -w
22
23						use Bio::DOOP::DOOP;
24
25						$test = Bio::DOOP::Util::Run::GeneMerge->new();
26
27						if ($test->getDescFile("GO/use/GO.BP.use") < 0){
28						print"Desc error\n"
29						}
30
31						if ($test->getAssocFile("GO/assoc/A_thaliana.converted.BP") < 0){
32						print"Assoc error\n"
33						}
34
35						if ($test->getPopFile("GO/pop.500") < 0){
36						print"Pop error\n"
37						}
38
39						if ($test->getStudyFile("GO/study.500/combined1314.list") < 0){
40						print"Study error\n"
41						}
42
43						$results = $test->getResults();
44
45						foreach $res (@{$results}) {
46						print $$res{'GOterm'}," ",$$res{'RawEs'},"\n";
47						}
48
49						=head1 DESCRIPTION
50
51						This is a module based on GeneMerge v1.2.
52
53						Original program described in:
54
55						Cristian I. Castillo-Davis and Daniel L. Hartl
56						GeneMerge - post-genomic analysis, data mining, and hypothesis testing
57						Bioinformatics Vol. 19 no. 7 2003, Pages 891-892
58
59						The original program is not really good for large scale analysis,
60						because the design uses a lot of I/O processes. This version fetches
61						everything into memory at start.
62
63						=head1 AUTHORS
64
65						Tibor Nagy, Godollo, Endre Sebestyen, Martonvasar,
66
67						=head1 METHODS
68
69						=head2 new
70
71						Create new GeneMerge object.
72
73						$genemerge = Bio::DOOP::Util::Run::GeneMerge->new;
74
75						=cut
76
77						sub new {
78	0		0	1		my $self = {};
79	0					my $dummy = shift;
80
81	0					$self->{HoAssoc} = ();
82	0					$self->{HoPopAssocCount} = ();
83	0					$self->{HoPopAssocFreq} = ();
84	0					$self->{PopGeneNo} = 0;
85	0					$self->{HoDesc} = ();
86	0					$self->{StudyGeneNo} = 0;
87	0					$self->{StudyGeneNoAssoc} = 0;
88	0					$self->{HoStudyGeneAssocCount} = ();
89	0					$self->{HoAssocStudyGene} = ();
90	0					$self->{StudyGeneUniqAssoc} = 0;
91	0					$self->{BonferroniCorr} = 0;
92	0					$self->{HoStudyGeneAssocPVal} = ();
93
94	0					bless $self;
95	0					return ($self);
96						}
97
98						=head2 getAssocFile
99
100						The method loads the GO association file and stores it in memory.
101						The file format is the following. Each line starts with a cluster id, and after some whitespace
102						the associated GO ids are enumerated, separated by semicolons.
103
104						81001020 GO:0016020;GO:0003674;GO:0008150
105						81001110 GO:0005739;GO:0003674
106
107						$genemerge->getAssocFile('/tmp/assoc.txt');
108
109						=cut
110
111						sub getAssocFile {
112	0		0	1		my $self = shift;
113	0					my $filename = shift;
114
115	0	0				open ASSOC, $filename or return(-1);
116
117	0					while(){
118	0					chomp;
119	0					my @assoc_line = split;
120	0					my $assoc_gene = $assoc_line[0];
121	0					my @assoc_go = ();
122
123	0	0				if ($assoc_line[1]) {
124	0					@assoc_go = split /;/, $assoc_line[1];
125	0					@{$self->{HoAssoc}{$assoc_gene}} = @assoc_go;
	0
126						}
127
128						}
129	0					close ASSOC;
130
131	0					return(0);
132						}
133
134						=head2 getPopFile
135
136						The method loads the population file and stores it in memory.
137						The file format is the following. Each line contains one and only one
138						cluster id.
139
140						81001020
141						81001110
142
143						$genemerge->getPopFile('/tmp/pop.txt');
144
145						=cut
146
147						sub getPopFile {
148	0		0	1		my $self = shift;
149	0					my $filename = shift;
150
151	0	0				open POP, $filename or return(-1);
152	0					while () {
153	0					chomp;
154	0					my $PopGene = $_;
155	0					$self->{PopGeneNo}++;
156
157	0	0				if (exists $self->{HoAssoc}{$PopGene}) {
158	0					foreach my $AssocGO (@{$self->{HoAssoc}{$PopGene}}) {
	0
159	0					$self->{HoPopAssocCount}{$AssocGO}++;
160						}
161						}
162						}
163	0					close POP;
164
165	0					$self->popFreq();
166
167	0					return(0);
168						}
169
170						=head2 popFreq
171
172						The method calculates the population frequency. Do not use it directly.
173
174						=cut
175
176						sub popFreq {
177	0		0	1		my $self = shift;
178
179	0					foreach my $PopAssocCountKey (keys %{$self->{HoPopAssocCount}}) {
	0
180	0					my $freq = $self->{HoPopAssocCount}{$PopAssocCountKey} / $self->{PopGeneNo};
181	0					$self->{HoPopAssocFreq}{$PopAssocCountKey} = $freq;
182						}
183						}
184
185						=head2 getDescFile
186
187						The method loads the GO description file.
188						The file format is the following. Each line starts with the GO id, and separated by a tab,
189						the description of the GO id.
190
191						GO:0000007 low-affinity zinc ion transporter activity
192						GO:0000008 thioredoxin
193
194						$genemerge->getDescFile('/tmp/desc.txt');
195
196						=cut
197
198						sub getDescFile {
199	0		0	1		my $self = shift;
200	0					my $filename = shift;
201
202	0	0				open DESC, $filename or return(-1);
203	0					while () {
204	0					chomp;
205	0					my @desc_line = split /\s/, $_, 2;
206	0					$self->{HoDesc}{$desc_line[0]} = $desc_line[1];
207						}
208	0					close DESC;
209
210	0					return(0);
211						}
212
213						=head2 getStudyFile
214
215						The method loads the study data set, counts GO frequencies, calculates P values
216						based on the hypergeometric distribution, and corrects P values, based on the
217						Bonferroni method.
218
219						The file format of the study file is the following. Each line contains one and only one
220						cluster id.
221
222						81001020
223						81001110
224
225						$genemerge->getStudyFile('/tmp/study.txt');
226
227						=cut
228
229						sub getStudyFile {
230						# TODO we should split this in 2 or 3.
231	0		0	1		my $self = shift;
232	0					my $filename = shift;
233
234	0	0				open STUDY, $filename or return(-1);
235	0					while() {
236	0					chomp;
237	0					$self->{StudyGeneNo}++;
238	0					my $StudyGene = $_;
239	0	0				if (exists $self->{HoAssoc}{$StudyGene}) {
240	0					foreach my $StudyGeneGO (@{$self->{HoAssoc}{$StudyGene}}) {
	0
241	0					$self->{HoStudyGeneAssocCount}{$StudyGeneGO}++;
242	0					push @{$self->{HoAssocStudyGene}{$StudyGeneGO}}, $StudyGene;
	0
243						}
244						} else {
245	0					$self->{StudyGeneNoAssoc}++;
246						}
247
248						}
249	0					close STUDY;
250
251						#Bonferroni correction
252	0					foreach my $StudyGeneAssocCountKey (keys %{$self->{HoStudyGeneAssocCount}}){
	0
253	0					$self->{StudyGeneUniqAssoc}++;
254	0	0				if($self->{HoPopAssocFreq}{$StudyGeneAssocCountKey} > (1 / $self->{PopGeneNo})) {
255	0					$self->{BonferroniCorr}++;
256						}
257						}
258
259						#Calculate P-values based on hypergeometric distribution
260	0					my $PVal = 0;
261	0					my $PValC = 0;
262	0					my $N = $self->{PopGeneNo};
263	0					my $K = $self->{StudyGeneNo};
264
265	0					foreach my $StudyGeneAssocCountKey (keys %{$self->{HoStudyGeneAssocCount}}){
	0
266	0					my $P = $self->{HoPopAssocFreq}{$StudyGeneAssocCountKey};
267	0					my $R = $self->{HoStudyGeneAssocCount}{$StudyGeneAssocCountKey};
268	0	0				if ($R != 1) {
269	0					$PVal = $self->hypergeometric($N,$P,$K,$R);
270	0	0				$PValC = ($PVal * $self->{BonferroniCorr} >= 1) ? 1 : $PVal * $self->{BonferroniCorr};
271						} else {
272	0					$PVal = 'NA';
273	0					$PValC = 'NA';
274						}
275	0					${$self->{HoStudyGeneAssocPVal}{$StudyGeneAssocCountKey}}[0] = $PVal;
	0
276	0					${$self->{HoStudyGeneAssocPVal}{$StudyGeneAssocCountKey}}[1] = $PValC;
	0
277						}
278
279	0					return(0);
280						}
281
282						=head2 getResults
283
284						The method gives back all the results as an arrayref of hashes.
285
286						$results = $genemerge->getResults();
287						foreach $result (@{$results}) {
288						$goterm = $$result{'GOterm'};
289						$popfreq = $$result{'PopFreq'};
290						$popfrac = $$result{'PopFrac'};
291						$studyfrac = $$result{'StudyFrac'};
292						$studyfracall = $$result{'StudyFracAll'};
293						$raw_escore = $$result{'RawEs'};
294						$escore = $$result{'EScore'};
295						$desc = $$result{'Desc'};
296						@contrib = @{$$result{'Contrib'}};
297						}
298
299						=cut
300
301						sub getResults {
302	0		0	1		my $self = shift;
303	0					my @results;
304
305	0					foreach my $goterm (sort keys %{$self->{HoStudyGeneAssocCount}}) {
	0
306	0					my %result;
307	0					$result{'GOterm'} = $goterm;
308	0					$result{'PopFreq'} = $self->{HoPopAssocFreq}{$goterm};
309	0					$result{'PopFrac'} = $self->{HoPopAssocCount}{$goterm};
310	0					$result{'PopFracAll'} = $self->{PopGeneNo};
311	0					$result{'StudyFrac'} = $self->{HoStudyGeneAssocCount}{$goterm};
312	0					$result{'StudyFracAll'} = $self->{StudyGeneNo};
313	0					$result{'RawEs'} = ${$self->{HoStudyGeneAssocPVal}{$goterm}}[0];
	0
314	0					$result{'EScore'} = ${$self->{HoStudyGeneAssocPVal}{$goterm}}[1];
	0
315	0					$result{'Desc'} = $self->{HoDesc}{$goterm};
316	0					$result{'Contrib'} = \@{$self->{HoAssocStudyGene}{$goterm}};
	0
317	0					push @results, \%result;
318						}
319
320	0					return(\@results);
321						}
322
323						=head2 hypergeometric
324
325						This is an internal function to calculate the hypergeometric distribution. Do not use it directly.
326
327						=cut
328
329						sub hypergeometric {
330	0		0	1		my $self = shift;
331	0					my $n = shift;
332	0					my $p = shift;
333	0					my $k = shift;
334	0					my $r = shift;
335
336	0					my $i = '0';
337	0					my $q = '0';
338	0					my $np = '0';
339	0					my $nq = '0';
340	0					my $top = '0';
341	0					my $sum = '0';
342	0					my $lfoo = '0';
343
344	0					my $logNchooseK = '0';
345
346	0					$q = 1 - $p;
347
348	0					$np = floor( $n * $p + 0.5 );
349	0					$nq = floor( $n * $q + 0.5 );
350
351	0					$logNchooseK = &logNchooseK( $n, $k );
352
353	0	0				$top = ($np < $k) ? $np : $k;
354
355	0					$lfoo = &logNchooseK($np, $top) + &logNchooseK($n * (1 - $p), $k - $top);
356
357	0					for ($i = $top ; $i >= $r ; $i--) {
358	0					$sum += exp($lfoo - $logNchooseK);
359	0	0				if ($i > $r) { $lfoo = $lfoo + log($i / ($np - $i + 1)) + log(($nq - $k + $i) / ($k - $i + 1)) }
	0
360						}
361	0					return $sum;
362						}
363
364						=head2 logNchooseK
365
366						Another internal function for the correct statistical results. Do not use it directly.
367
368						=cut
369
370						sub logNchooseK {
371	0		0	1		my $n = shift;
372	0					my $k = shift;
373
374	0					my $i = '0';
375	0					my $result = '0';
376
377	0	0				$k = ($k > ($n - $k)) ? $n - $k : $k;
378
379	0					for ($i = $n ; $i > ($n - $k) ; $i--) { $result += log($i) }
	0
380
381	0					$result -= &lFactorial($k);
382
383	0					return $result;
384						}
385
386						=head2 lFactorial
387
388						Factorial calculating function. Do not use it directly.
389
390						=cut
391
392						sub lFactorial {
393	0		0	1		my $number = shift;
394	0					my $result = 0;
395	0					my $i;
396
397	0					for ($i = 2 ; $i <= $number ; $i++) { $result += log($i) }
	0
398
399	0					return $result;
400						}
401
402						1;