File Coverage

blib/lib/obogaf/parser.pm

Criterion	Covered	Total	%
statement	192	192	100.0
branch	83	86	96.5
condition	3	3	100.0
subroutine	11	11	100.0
pod	6	6	100.0
total	295	298	98.9

line	stmt	bran	cond	sub	pod	time	code
1							package obogaf::parser;
2
3							require 5.006;
4							our $VERSION= '1.272';
5							$VERSION= eval $VERSION;
6
7	6			6		437498	use strict;
	6					90
	6					169
8	6			6		29	use warnings;
	6					11
	6					135
9	6			6		4067	use Graph;
	6					593460
	6					214
10	6			6		2538	use IO::File;
	6					45332
	6					574
11	6			6		2520	use PerlIO::gzip;
	6					2954
	6					13249
12
13							sub build_edges{
14	8			8	1	15599	my ($obofile)= @_;
15	8					21	my ($namespace, $idname, $isname, $pofname, $source, $destination, $pof, $res);
16	8	100				44	if($obofile=~/.obo$/){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; } else { die "cannot open $obofile. The extension must be obo.\n"; }
	7	100				262
	1					7
17	6					110	while(){
18	275					304	chomp;
19	275	100				540	next if $_=~/^\s*$/;
20	263	100				805	if($_=~/^namespace:\s+(\D+)/){
		100
		100
		100
		100
21	5					24	$namespace=$1;
22							}elsif($_=~/^name:\s+(.+)/){
23	6					29	$idname=$1;
24							}elsif($_=~/^id:\s+(\D+\d+)/){
25	6					44	$destination=$1;
26							}elsif($_=~/^is_a:\s+(\D+\d+)/){
27	6					14	$source=$1;
28	6					38	($isname)= ($_=~/!\s+(.+)/);
29	6	100				16	if(defined $namespace){
30	5					34	$res .= "$namespace\t$source\t$destination\t$isname\t$idname\tis-a\n";
31							}else{
32	1					9	$res .= "$source\t$destination\t$isname\t$idname\tis-a\n";
33							}
34							}elsif($_=~/^relationship: part_of\s+(\D+\d+)/){
35	6					54	$pof=$1;
36	6					27	($pofname)= ($_=~/!\s+(.+)/);
37	6	100				35	if(defined $namespace){
38	5					34	$res .= "$namespace\t$pof\t$destination\t$pofname\t$idname\tpart-of\n";
39							}else{
40	1					7	$res .= "$pof\t$destination\t$pofname\t$idname\tpart-of\n";
41							}
42							}
43							}
44	6					51	close FH;
45	6					32	return \$res;
46							}
47
48							sub build_subonto{
49	6			6	1	15093	my ($edgesfile, $namespace)= @_;
50	6					11	my ($res, %checker);
51	6	100				205	open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n";
52	5					86	while(){
53	11	100				146	next if $_=~/^[!,#]\|^\s*$/;
54	10					45	my @vals= split(/\t/, $_);
55	10					22	$checker{$vals[0]}=1;
56	10	100				30	if($vals[0] eq $namespace){ $res .= join("\t", @vals[1..$#vals]); }
	8					75
57							}
58	5					43	close FH;
59	5	100				23	unless(exists($checker{$namespace})){die "$edgesfile does not include $namespace or $namespace is not in the first column of $edgesfile.\n";}
	1					15
60	4					20	return \$res;
61							}
62
63							sub make_stat{
64	5			5	1	2345	my ($edgesfile, $parentIndex, $childIndex)= @_;
65	5					12	my (%indeg, %outdeg, %deg, $ed, $nd, $mindeg, $maxdeg, $medeg, $avgdeg, $den, $scc, $resdeg, $stat, $res);
66							## create graph
67	5					21	my $g= Graph->new(directed => 1);
68	5	100				1331	open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n";
69	4					75	while(){
70	7					416	chomp;
71	7					32	my @vals= split(/\t/,$_);
72	7					27	$g->add_edge($vals[$parentIndex], $vals[$childIndex]);
73							}
74	4					480	close FH;
75							## compute indegree/outdegree/degree
76	4					21	my @V= $g->vertices;
77	4					223	foreach my $nd (@V){
78	11					45	my $i= $g->in_degree($nd);
79	11					1267	my $o= $g->out_degree($nd);
80	11					1152	my $d= $i+$o;
81	11					21	$indeg{$nd}=$i;
82	11					15	$outdeg{$nd}=$o;
83	11					21	$deg{$nd}=$d;
84							}
85	4	50				18	foreach my $node (sort{$deg{$b}<=>$deg{$a} or ($a cmp $b)} keys %deg){ $resdeg .= "$node\t$deg{$node}\t$indeg{$node}\t$outdeg{$node}\n"; }
	10					33
	11					37
86							## compute: median/max/min degree
87	4					13	my @sortdeg= sort{$a<=>$b} values (%deg);
	10					22
88	4					9	my $len= $#sortdeg+1;
89	4					10	my $mid = int $len/2;
90	4	100				14	if($len % 2){ $medeg = $sortdeg[$mid]; }else{ $medeg = ( $sortdeg[$mid-1] + $sortdeg[$mid] ) / 2; }
	3					6
	1					3
91	4					28	$medeg= sprintf("%.4f", $medeg);
92	4					8	$mindeg= $sortdeg[0];
93	4					8	$maxdeg= $sortdeg[$#sortdeg];
94							## compute number of nodes and edges
95	4					13	$ed= $g->edges;
96	4					146	$nd= $g->vertices;
97							## compute average degree and density
98	4					209	$avgdeg= $ed/$nd;
99	4					8	$den= $ed / ( $nd * ($nd -1) );
100	4					23	$avgdeg= sprintf("%.4f", $avgdeg);
101	4					14	$den= sprintf("%.4e", $den);
102							## return stat
103	4					12	$stat .= "nodes: $nd\nedges: $ed\nmax degree: $maxdeg\nmin degree: $mindeg\n";
104	4					10	$stat .= "median degree: $medeg\naverage degree: $avgdeg\ndensity: $den\n";
105	4					11	$res= "#oboterm degree indegree outdegree\n".$resdeg."\n"."~summary stat~\n".$stat;
106	4					60	return $res;
107							}
108
109							sub get_parents_or_children_list{
110	10			10	1	15818	my ($edgesfile, $parentIndex, $childIndex, $chdORpar)= @_;
111	10					16	my (%nodelist);
112	10	100	100			39	if($chdORpar ne "parents" && $chdORpar ne "children"){ die "$chdORpar can be 'parents' or 'children'.\n";}
	2					14
113	8	100				260	open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n";
114	6					96	while(){
115	12					24	chomp;
116	12					49	my @vals= split(/\t/,$_);
117	12	100				24	if($chdORpar eq "parents"){
118	6					42	$nodelist{$vals[$childIndex]} .= $vals[$parentIndex]."\|";
119							}else{
120	6					45	$nodelist{$vals[$parentIndex]} .= $vals[$childIndex]."\|";
121							}
122							}
123	6					55	close FH;
124	6					24	foreach my $term (keys %nodelist){ chop $nodelist{$term}; }
	9					17
125	6					29	return \%nodelist;
126							}
127
128							sub gene2biofun{
129	6			6	1	14814	my ($annfile, $geneIndex, $classIndex)= @_;
130	6					13	my (%gene2biofun, @genes, @biofun, $stat)= ();
131	6					15	my ($sample, $oboterm)= (0)x2;
132	6	100				20	if ($annfile =~ /.gz$/){
133	2	100				96	open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n";
134							}else{
135	4	100				136	open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n";
136							}
137	4					90	while(){
138	27	100				384	next if $_=~/^[!,#]\|^\s*$/;
139	24					29	chomp;
140	24					83	my @vals=split(/\t/,$_);
141	24					42	push(@genes, $vals[$geneIndex]);
142	24					30	push(@biofun, $vals[$classIndex]);
143	24					93	$gene2biofun{$vals[$geneIndex]} .= $vals[$classIndex]."\|";
144							}
145	4					48	close FH;
146	4					16	foreach my $gene (keys %gene2biofun){ chop $gene2biofun{$gene}; }
	8					13
147	4					19	my %seen=();
148	4					9	my @uniqgenes= grep{!$seen{$_}++} @genes;
	24					50
149	4					7	$sample= scalar(@uniqgenes);
150	4					8	undef %seen;
151	4					6	my @uniqpbiofun= grep{!$seen{$_}++} @biofun;
	24					48
152	4					7	$oboterm= scalar(@uniqpbiofun);
153	4					11	$stat .= "genes: $sample\nontology terms: $oboterm\n";
154	4					26	return (\%gene2biofun, \$stat);
155							}
156
157							sub map_OBOterm_between_release{
158	9			9	1	20389	my ($obofile, $annfile, $classIndex)= @_;
159	9					21	my (%altid, %oldclass, %old2new, $header, $id, $fln, $pair, $stat, $pstat);
160	9					74	my ($alt, $classes, $seen, $unseen)= (0)x4;
161							## step 0: pairing altid_2_id (key: alt_id)
162	9	100				49	if($obofile=~/.obo$/){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; } else { die "cannot open $obofile. The extension must be obo.\n"; }
	8	100				274
	1					8
163	7					119	while (){
164	322					431	chomp;
165	322	100				683	next if $_=~/^\s*$/;
166	308	100				515	if($_=~/^id:\s+(\D+\d+)/){ $id=$1; }
	7					22
167	308	100				723	if($_=~/^alt_id:\s+(\D+\d+)/){ $altid{$1}=$id; }
	49					168
168							}
169	7					60	close FH;
170	7					20	$alt= keys(%altid);
171							# step 1: storing old ontology terms in a hash
172	7	100				24	if ($annfile =~ /.gz$/){
173	2	100				110	open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n";
174							}else{
175	5	100				169	open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n";
176							}
177	5					118	while(){
178	28					58	chomp;
179	28	100				469	if($_=~/^[!,#]\|^\s*$/){ $header .= "$_\n"; }
	2					7
180	28	100				407	next if $_=~/^[!,#]\|^\s*$/;
181	26					109	my @vals=split(/\t/,$_);
182	26					134	$oldclass{$vals[$classIndex]}=$vals[$classIndex];
183							}
184	5					59	close FH;
185	5					14	$classes= keys(%oldclass);
186							## step 2: mapping old GO terms to the new one using alt_id as key
187	5					9	my $tmp= "";
188	5					27	foreach my $k (sort{$a cmp $b} keys(%altid)){
	62					88
189	35	100				62	if($oldclass{$k}){
190	8					16	$old2new{$k}=$altid{$oldclass{$k}}; ## pairing
191	8					10	$seen++;
192	8					12	$tmp= $altid{$oldclass{$k}};
193							}else{
194	27					31	$tmp= "unseen";
195	27					36	$unseen++;
196							}
197	35	100				63	if($tmp ne "unseen"){
198	8					22	$pair .= "$k\t$altid{$oldclass{$k}}\n";
199							}
200							}
201							## step 3: substitute ALT-ID with the updated ID, then the annotation file is returned.
202	5	100				19	if ($annfile =~ /.gz$/){
203	1	50				44	open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n";
204							}else{
205	4	50				119	open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n";
206							}
207	5					69	while(){
208	28					56	chomp;
209	28	100				453	next if $_=~/^[!,#]\|^\s*$/;
210	26					104	my @vals= split(/\t/, $_);
211	26					41	my $oboterm= $vals[$classIndex];
212	26	100				53	if($old2new{$oboterm}){
213	16					22	$oboterm= $old2new{$oboterm};
214	16					138	$_=~ s/$vals[$classIndex]/$oboterm/g;
215	16					87	$fln .= "$_\n";
216							}else{
217	10					74	$fln .= "$_\n";
218							}
219							}
220	5					46	close FH;
221	5	100				16	if(defined $header){$fln = $header.$fln;}
	1					4
222							## print mapping stat
223	5					18	$stat .= "Tot. ontology terms:\t$classes\nTot. altID:\t$alt\nTot. altID seen:\t$seen\nTot. altID unseen:\t$unseen\n";
224	5	100				9	unless(not defined $pair){
225	4					12	$pstat .= "#alt-id id\n$pair\n$stat";
226	4					39	return (\$fln, \$pstat);
227							}
228	1					11	return (\$fln, \$stat);
229							}
230
231							1;
232
233							__END__