line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package obogaf::parser; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
require 5.006; |
4
|
|
|
|
|
|
|
our $VERSION= '1.272'; |
5
|
|
|
|
|
|
|
$VERSION= eval $VERSION; |
6
|
|
|
|
|
|
|
|
7
|
6
|
|
|
6
|
|
437498
|
use strict; |
|
6
|
|
|
|
|
90
|
|
|
6
|
|
|
|
|
169
|
|
8
|
6
|
|
|
6
|
|
29
|
use warnings; |
|
6
|
|
|
|
|
11
|
|
|
6
|
|
|
|
|
135
|
|
9
|
6
|
|
|
6
|
|
4067
|
use Graph; |
|
6
|
|
|
|
|
593460
|
|
|
6
|
|
|
|
|
214
|
|
10
|
6
|
|
|
6
|
|
2538
|
use IO::File; |
|
6
|
|
|
|
|
45332
|
|
|
6
|
|
|
|
|
574
|
|
11
|
6
|
|
|
6
|
|
2520
|
use PerlIO::gzip; |
|
6
|
|
|
|
|
2954
|
|
|
6
|
|
|
|
|
13249
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub build_edges{ |
14
|
8
|
|
|
8
|
1
|
15599
|
my ($obofile)= @_; |
15
|
8
|
|
|
|
|
21
|
my ($namespace, $idname, $isname, $pofname, $source, $destination, $pof, $res); |
16
|
8
|
100
|
|
|
|
44
|
if($obofile=~/.obo$/){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; } else { die "cannot open $obofile. The extension must be obo.\n"; } |
|
7
|
100
|
|
|
|
262
|
|
|
1
|
|
|
|
|
7
|
|
17
|
6
|
|
|
|
|
110
|
while(){ |
18
|
275
|
|
|
|
|
304
|
chomp; |
19
|
275
|
100
|
|
|
|
540
|
next if $_=~/^\s*$/; |
20
|
263
|
100
|
|
|
|
805
|
if($_=~/^namespace:\s+(\D+)/){ |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
21
|
5
|
|
|
|
|
24
|
$namespace=$1; |
22
|
|
|
|
|
|
|
}elsif($_=~/^name:\s+(.+)/){ |
23
|
6
|
|
|
|
|
29
|
$idname=$1; |
24
|
|
|
|
|
|
|
}elsif($_=~/^id:\s+(\D+\d+)/){ |
25
|
6
|
|
|
|
|
44
|
$destination=$1; |
26
|
|
|
|
|
|
|
}elsif($_=~/^is_a:\s+(\D+\d+)/){ |
27
|
6
|
|
|
|
|
14
|
$source=$1; |
28
|
6
|
|
|
|
|
38
|
($isname)= ($_=~/!\s+(.+)/); |
29
|
6
|
100
|
|
|
|
16
|
if(defined $namespace){ |
30
|
5
|
|
|
|
|
34
|
$res .= "$namespace\t$source\t$destination\t$isname\t$idname\tis-a\n"; |
31
|
|
|
|
|
|
|
}else{ |
32
|
1
|
|
|
|
|
9
|
$res .= "$source\t$destination\t$isname\t$idname\tis-a\n"; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
}elsif($_=~/^relationship: part_of\s+(\D+\d+)/){ |
35
|
6
|
|
|
|
|
54
|
$pof=$1; |
36
|
6
|
|
|
|
|
27
|
($pofname)= ($_=~/!\s+(.+)/); |
37
|
6
|
100
|
|
|
|
35
|
if(defined $namespace){ |
38
|
5
|
|
|
|
|
34
|
$res .= "$namespace\t$pof\t$destination\t$pofname\t$idname\tpart-of\n"; |
39
|
|
|
|
|
|
|
}else{ |
40
|
1
|
|
|
|
|
7
|
$res .= "$pof\t$destination\t$pofname\t$idname\tpart-of\n"; |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
} |
44
|
6
|
|
|
|
|
51
|
close FH; |
45
|
6
|
|
|
|
|
32
|
return \$res; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
sub build_subonto{ |
49
|
6
|
|
|
6
|
1
|
15093
|
my ($edgesfile, $namespace)= @_; |
50
|
6
|
|
|
|
|
11
|
my ($res, %checker); |
51
|
6
|
100
|
|
|
|
205
|
open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n"; |
52
|
5
|
|
|
|
|
86
|
while(){ |
53
|
11
|
100
|
|
|
|
146
|
next if $_=~/^[!,#]|^\s*$/; |
54
|
10
|
|
|
|
|
45
|
my @vals= split(/\t/, $_); |
55
|
10
|
|
|
|
|
22
|
$checker{$vals[0]}=1; |
56
|
10
|
100
|
|
|
|
30
|
if($vals[0] eq $namespace){ $res .= join("\t", @vals[1..$#vals]); } |
|
8
|
|
|
|
|
75
|
|
57
|
|
|
|
|
|
|
} |
58
|
5
|
|
|
|
|
43
|
close FH; |
59
|
5
|
100
|
|
|
|
23
|
unless(exists($checker{$namespace})){die "$edgesfile does not include $namespace or $namespace is not in the first column of $edgesfile.\n";} |
|
1
|
|
|
|
|
15
|
|
60
|
4
|
|
|
|
|
20
|
return \$res; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub make_stat{ |
64
|
5
|
|
|
5
|
1
|
2345
|
my ($edgesfile, $parentIndex, $childIndex)= @_; |
65
|
5
|
|
|
|
|
12
|
my (%indeg, %outdeg, %deg, $ed, $nd, $mindeg, $maxdeg, $medeg, $avgdeg, $den, $scc, $resdeg, $stat, $res); |
66
|
|
|
|
|
|
|
## create graph |
67
|
5
|
|
|
|
|
21
|
my $g= Graph->new(directed => 1); |
68
|
5
|
100
|
|
|
|
1331
|
open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n"; |
69
|
4
|
|
|
|
|
75
|
while(){ |
70
|
7
|
|
|
|
|
416
|
chomp; |
71
|
7
|
|
|
|
|
32
|
my @vals= split(/\t/,$_); |
72
|
7
|
|
|
|
|
27
|
$g->add_edge($vals[$parentIndex], $vals[$childIndex]); |
73
|
|
|
|
|
|
|
} |
74
|
4
|
|
|
|
|
480
|
close FH; |
75
|
|
|
|
|
|
|
## compute indegree/outdegree/degree |
76
|
4
|
|
|
|
|
21
|
my @V= $g->vertices; |
77
|
4
|
|
|
|
|
223
|
foreach my $nd (@V){ |
78
|
11
|
|
|
|
|
45
|
my $i= $g->in_degree($nd); |
79
|
11
|
|
|
|
|
1267
|
my $o= $g->out_degree($nd); |
80
|
11
|
|
|
|
|
1152
|
my $d= $i+$o; |
81
|
11
|
|
|
|
|
21
|
$indeg{$nd}=$i; |
82
|
11
|
|
|
|
|
15
|
$outdeg{$nd}=$o; |
83
|
11
|
|
|
|
|
21
|
$deg{$nd}=$d; |
84
|
|
|
|
|
|
|
} |
85
|
4
|
50
|
|
|
|
18
|
foreach my $node (sort{$deg{$b}<=>$deg{$a} or ($a cmp $b)} keys %deg){ $resdeg .= "$node\t$deg{$node}\t$indeg{$node}\t$outdeg{$node}\n"; } |
|
10
|
|
|
|
|
33
|
|
|
11
|
|
|
|
|
37
|
|
86
|
|
|
|
|
|
|
## compute: median/max/min degree |
87
|
4
|
|
|
|
|
13
|
my @sortdeg= sort{$a<=>$b} values (%deg); |
|
10
|
|
|
|
|
22
|
|
88
|
4
|
|
|
|
|
9
|
my $len= $#sortdeg+1; |
89
|
4
|
|
|
|
|
10
|
my $mid = int $len/2; |
90
|
4
|
100
|
|
|
|
14
|
if($len % 2){ $medeg = $sortdeg[$mid]; }else{ $medeg = ( $sortdeg[$mid-1] + $sortdeg[$mid] ) / 2; } |
|
3
|
|
|
|
|
6
|
|
|
1
|
|
|
|
|
3
|
|
91
|
4
|
|
|
|
|
28
|
$medeg= sprintf("%.4f", $medeg); |
92
|
4
|
|
|
|
|
8
|
$mindeg= $sortdeg[0]; |
93
|
4
|
|
|
|
|
8
|
$maxdeg= $sortdeg[$#sortdeg]; |
94
|
|
|
|
|
|
|
## compute number of nodes and edges |
95
|
4
|
|
|
|
|
13
|
$ed= $g->edges; |
96
|
4
|
|
|
|
|
146
|
$nd= $g->vertices; |
97
|
|
|
|
|
|
|
## compute average degree and density |
98
|
4
|
|
|
|
|
209
|
$avgdeg= $ed/$nd; |
99
|
4
|
|
|
|
|
8
|
$den= $ed / ( $nd * ($nd -1) ); |
100
|
4
|
|
|
|
|
23
|
$avgdeg= sprintf("%.4f", $avgdeg); |
101
|
4
|
|
|
|
|
14
|
$den= sprintf("%.4e", $den); |
102
|
|
|
|
|
|
|
## return stat |
103
|
4
|
|
|
|
|
12
|
$stat .= "nodes: $nd\nedges: $ed\nmax degree: $maxdeg\nmin degree: $mindeg\n"; |
104
|
4
|
|
|
|
|
10
|
$stat .= "median degree: $medeg\naverage degree: $avgdeg\ndensity: $den\n"; |
105
|
4
|
|
|
|
|
11
|
$res= "#oboterm degree indegree outdegree\n".$resdeg."\n"."~summary stat~\n".$stat; |
106
|
4
|
|
|
|
|
60
|
return $res; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub get_parents_or_children_list{ |
110
|
10
|
|
|
10
|
1
|
15818
|
my ($edgesfile, $parentIndex, $childIndex, $chdORpar)= @_; |
111
|
10
|
|
|
|
|
16
|
my (%nodelist); |
112
|
10
|
100
|
100
|
|
|
39
|
if($chdORpar ne "parents" && $chdORpar ne "children"){ die "$chdORpar can be 'parents' or 'children'.\n";} |
|
2
|
|
|
|
|
14
|
|
113
|
8
|
100
|
|
|
|
260
|
open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n"; |
114
|
6
|
|
|
|
|
96
|
while(){ |
115
|
12
|
|
|
|
|
24
|
chomp; |
116
|
12
|
|
|
|
|
49
|
my @vals= split(/\t/,$_); |
117
|
12
|
100
|
|
|
|
24
|
if($chdORpar eq "parents"){ |
118
|
6
|
|
|
|
|
42
|
$nodelist{$vals[$childIndex]} .= $vals[$parentIndex]."|"; |
119
|
|
|
|
|
|
|
}else{ |
120
|
6
|
|
|
|
|
45
|
$nodelist{$vals[$parentIndex]} .= $vals[$childIndex]."|"; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
} |
123
|
6
|
|
|
|
|
55
|
close FH; |
124
|
6
|
|
|
|
|
24
|
foreach my $term (keys %nodelist){ chop $nodelist{$term}; } |
|
9
|
|
|
|
|
17
|
|
125
|
6
|
|
|
|
|
29
|
return \%nodelist; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub gene2biofun{ |
129
|
6
|
|
|
6
|
1
|
14814
|
my ($annfile, $geneIndex, $classIndex)= @_; |
130
|
6
|
|
|
|
|
13
|
my (%gene2biofun, @genes, @biofun, $stat)= (); |
131
|
6
|
|
|
|
|
15
|
my ($sample, $oboterm)= (0)x2; |
132
|
6
|
100
|
|
|
|
20
|
if ($annfile =~ /.gz$/){ |
133
|
2
|
100
|
|
|
|
96
|
open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n"; |
134
|
|
|
|
|
|
|
}else{ |
135
|
4
|
100
|
|
|
|
136
|
open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n"; |
136
|
|
|
|
|
|
|
} |
137
|
4
|
|
|
|
|
90
|
while(){ |
138
|
27
|
100
|
|
|
|
384
|
next if $_=~/^[!,#]|^\s*$/; |
139
|
24
|
|
|
|
|
29
|
chomp; |
140
|
24
|
|
|
|
|
83
|
my @vals=split(/\t/,$_); |
141
|
24
|
|
|
|
|
42
|
push(@genes, $vals[$geneIndex]); |
142
|
24
|
|
|
|
|
30
|
push(@biofun, $vals[$classIndex]); |
143
|
24
|
|
|
|
|
93
|
$gene2biofun{$vals[$geneIndex]} .= $vals[$classIndex]."|"; |
144
|
|
|
|
|
|
|
} |
145
|
4
|
|
|
|
|
48
|
close FH; |
146
|
4
|
|
|
|
|
16
|
foreach my $gene (keys %gene2biofun){ chop $gene2biofun{$gene}; } |
|
8
|
|
|
|
|
13
|
|
147
|
4
|
|
|
|
|
19
|
my %seen=(); |
148
|
4
|
|
|
|
|
9
|
my @uniqgenes= grep{!$seen{$_}++} @genes; |
|
24
|
|
|
|
|
50
|
|
149
|
4
|
|
|
|
|
7
|
$sample= scalar(@uniqgenes); |
150
|
4
|
|
|
|
|
8
|
undef %seen; |
151
|
4
|
|
|
|
|
6
|
my @uniqpbiofun= grep{!$seen{$_}++} @biofun; |
|
24
|
|
|
|
|
48
|
|
152
|
4
|
|
|
|
|
7
|
$oboterm= scalar(@uniqpbiofun); |
153
|
4
|
|
|
|
|
11
|
$stat .= "genes: $sample\nontology terms: $oboterm\n"; |
154
|
4
|
|
|
|
|
26
|
return (\%gene2biofun, \$stat); |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub map_OBOterm_between_release{ |
158
|
9
|
|
|
9
|
1
|
20389
|
my ($obofile, $annfile, $classIndex)= @_; |
159
|
9
|
|
|
|
|
21
|
my (%altid, %oldclass, %old2new, $header, $id, $fln, $pair, $stat, $pstat); |
160
|
9
|
|
|
|
|
74
|
my ($alt, $classes, $seen, $unseen)= (0)x4; |
161
|
|
|
|
|
|
|
## step 0: pairing altid_2_id (key: alt_id) |
162
|
9
|
100
|
|
|
|
49
|
if($obofile=~/.obo$/){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; } else { die "cannot open $obofile. The extension must be obo.\n"; } |
|
8
|
100
|
|
|
|
274
|
|
|
1
|
|
|
|
|
8
|
|
163
|
7
|
|
|
|
|
119
|
while (){ |
164
|
322
|
|
|
|
|
431
|
chomp; |
165
|
322
|
100
|
|
|
|
683
|
next if $_=~/^\s*$/; |
166
|
308
|
100
|
|
|
|
515
|
if($_=~/^id:\s+(\D+\d+)/){ $id=$1; } |
|
7
|
|
|
|
|
22
|
|
167
|
308
|
100
|
|
|
|
723
|
if($_=~/^alt_id:\s+(\D+\d+)/){ $altid{$1}=$id; } |
|
49
|
|
|
|
|
168
|
|
168
|
|
|
|
|
|
|
} |
169
|
7
|
|
|
|
|
60
|
close FH; |
170
|
7
|
|
|
|
|
20
|
$alt= keys(%altid); |
171
|
|
|
|
|
|
|
# step 1: storing old ontology terms in a hash |
172
|
7
|
100
|
|
|
|
24
|
if ($annfile =~ /.gz$/){ |
173
|
2
|
100
|
|
|
|
110
|
open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n"; |
174
|
|
|
|
|
|
|
}else{ |
175
|
5
|
100
|
|
|
|
169
|
open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n"; |
176
|
|
|
|
|
|
|
} |
177
|
5
|
|
|
|
|
118
|
while(){ |
178
|
28
|
|
|
|
|
58
|
chomp; |
179
|
28
|
100
|
|
|
|
469
|
if($_=~/^[!,#]|^\s*$/){ $header .= "$_\n"; } |
|
2
|
|
|
|
|
7
|
|
180
|
28
|
100
|
|
|
|
407
|
next if $_=~/^[!,#]|^\s*$/; |
181
|
26
|
|
|
|
|
109
|
my @vals=split(/\t/,$_); |
182
|
26
|
|
|
|
|
134
|
$oldclass{$vals[$classIndex]}=$vals[$classIndex]; |
183
|
|
|
|
|
|
|
} |
184
|
5
|
|
|
|
|
59
|
close FH; |
185
|
5
|
|
|
|
|
14
|
$classes= keys(%oldclass); |
186
|
|
|
|
|
|
|
## step 2: mapping old GO terms to the new one using *alt_id* as key |
187
|
5
|
|
|
|
|
9
|
my $tmp= ""; |
188
|
5
|
|
|
|
|
27
|
foreach my $k (sort{$a cmp $b} keys(%altid)){ |
|
62
|
|
|
|
|
88
|
|
189
|
35
|
100
|
|
|
|
62
|
if($oldclass{$k}){ |
190
|
8
|
|
|
|
|
16
|
$old2new{$k}=$altid{$oldclass{$k}}; ## pairing |
191
|
8
|
|
|
|
|
10
|
$seen++; |
192
|
8
|
|
|
|
|
12
|
$tmp= $altid{$oldclass{$k}}; |
193
|
|
|
|
|
|
|
}else{ |
194
|
27
|
|
|
|
|
31
|
$tmp= "unseen"; |
195
|
27
|
|
|
|
|
36
|
$unseen++; |
196
|
|
|
|
|
|
|
} |
197
|
35
|
100
|
|
|
|
63
|
if($tmp ne "unseen"){ |
198
|
8
|
|
|
|
|
22
|
$pair .= "$k\t$altid{$oldclass{$k}}\n"; |
199
|
|
|
|
|
|
|
} |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
## step 3: substitute ALT-ID with the updated ID, then the annotation file is returned. |
202
|
5
|
100
|
|
|
|
19
|
if ($annfile =~ /.gz$/){ |
203
|
1
|
50
|
|
|
|
44
|
open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n"; |
204
|
|
|
|
|
|
|
}else{ |
205
|
4
|
50
|
|
|
|
119
|
open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n"; |
206
|
|
|
|
|
|
|
} |
207
|
5
|
|
|
|
|
69
|
while(){ |
208
|
28
|
|
|
|
|
56
|
chomp; |
209
|
28
|
100
|
|
|
|
453
|
next if $_=~/^[!,#]|^\s*$/; |
210
|
26
|
|
|
|
|
104
|
my @vals= split(/\t/, $_); |
211
|
26
|
|
|
|
|
41
|
my $oboterm= $vals[$classIndex]; |
212
|
26
|
100
|
|
|
|
53
|
if($old2new{$oboterm}){ |
213
|
16
|
|
|
|
|
22
|
$oboterm= $old2new{$oboterm}; |
214
|
16
|
|
|
|
|
138
|
$_=~ s/$vals[$classIndex]/$oboterm/g; |
215
|
16
|
|
|
|
|
87
|
$fln .= "$_\n"; |
216
|
|
|
|
|
|
|
}else{ |
217
|
10
|
|
|
|
|
74
|
$fln .= "$_\n"; |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
} |
220
|
5
|
|
|
|
|
46
|
close FH; |
221
|
5
|
100
|
|
|
|
16
|
if(defined $header){$fln = $header.$fln;} |
|
1
|
|
|
|
|
4
|
|
222
|
|
|
|
|
|
|
## print mapping stat |
223
|
5
|
|
|
|
|
18
|
$stat .= "Tot. ontology terms:\t$classes\nTot. altID:\t$alt\nTot. altID seen:\t$seen\nTot. altID unseen:\t$unseen\n"; |
224
|
5
|
100
|
|
|
|
9
|
unless(not defined $pair){ |
225
|
4
|
|
|
|
|
12
|
$pstat .= "#alt-id id\n$pair\n$stat"; |
226
|
4
|
|
|
|
|
39
|
return (\$fln, \$pstat); |
227
|
|
|
|
|
|
|
} |
228
|
1
|
|
|
|
|
11
|
return (\$fln, \$stat); |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
1; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
__END__ |