File Coverage

blib/lib/obogaf/parser.pm

Criterion	Covered	Total	%
statement	228	228	100.0
branch	96	102	94.1
condition	5	6	83.3
subroutine	13	13	100.0
pod	7	7	100.0
total	349	356	98.0

line	stmt	bran	cond	sub	pod	time	code
1							package obogaf::parser;
2							require 5.006;
3							our $VERSION= '1.373';
4							$VERSION= eval $VERSION;
5
6							require Exporter;
7							our @ISA= qw(Exporter);
8							our %EXPORT_TAGS=(
9							all => [qw(
10							&build_edges
11							&build_subonto
12							&make_stat
13							&get_parents_or_children_list
14							&obo_filter
15							&gene2biofun
16							&map_OBOterm_between_release
17							)],
18							);
19							our @EXPORT_OK= (@{$EXPORT_TAGS{'all'}});
20
21	7			7		611698	use strict;
	7					72
	7					211
22	7			7		38	use warnings;
	7					12
	7					164
23	7			7		5825	use Graph;
	7					821520
	7					360
24	7			7		4432	use List::MoreUtils qw(uniq);
	7					92691
	7					44
25	7			7		11583	use IO::File;
	7					64000
	7					826
26	7			7		3431	use PerlIO::gzip;
	7					4131
	7					23163
27
28							sub build_edges{
29	8			8	1	17661	my ($obofile)= @_;
30	8					19	my ($namespace, $idname, $isname, $pofname, $source, $destination, $pof, $res);
31	8	100				51	if($obofile=~/.obo$/i){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; }
	8	50				348
32	6					135	while(){
33	275					360	chomp;
34	275	100				650	next if $_=~/^\s*$/;
35	263	100				1015	if($_=~/^namespace:\s+(\D+)/){
		100
		100
		100
		100
36	5					20	$namespace=$1;
37							}elsif($_=~/^name:\s+(.+)/){
38	6					55	$idname=$1;
39							}elsif($_=~/^id:\s+(\D+\d+)/){
40	6					39	$destination=$1;
41							}elsif($_=~/^is_a:\s+(\D+\d+)/){
42	6					17	$source=$1;
43	6					28	($isname)= ($_=~/!\s+(.+)/);
44	6	100				21	if(defined $namespace){
45	5					59	$res .= "$namespace\t$source\t$destination\t$isname\t$idname\tis-a\n";
46							}else{
47	1					7	$res .= "$source\t$destination\t$isname\t$idname\tis-a\n";
48							}
49							}elsif($_=~/^relationship: part_of\s+(\D+\d+)/){
50	6					23	$pof=$1;
51	6					38	($pofname)= ($_=~/!\s+(.+)/);
52	6	100				17	if(defined $namespace){
53	5					53	$res .= "$namespace\t$pof\t$destination\t$pofname\t$idname\tpart-of\n";
54							}else{
55	1					5	$res .= "$pof\t$destination\t$pofname\t$idname\tpart-of\n";
56							}
57							}
58							}
59	6					72	close FH;
60	6					35	return \$res;
61							}
62
63							sub build_subonto{
64	6			6	1	19242	my ($edgesfile, $namespace)= @_;
65	6					13	my ($res, %checker);
66	6	100				240	open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n";
67	5					113	while(){
68	11	100				165	next if $_=~/^[!,#]\|^\s*$/;
69	10					46	my @vals= split(/\t/, $_);
70	10					25	$checker{$vals[0]}=1;
71	10	100				36	if($vals[0] eq $namespace){ $res .= join("\t", @vals[1..$#vals]); }
	8					98
72							}
73	5					54	close FH;
74	5	100				23	unless(exists($checker{$namespace})){die "$edgesfile does not include $namespace or $namespace is not in the first column of $edgesfile.\n";}
	1					17
75	4					27	return \$res;
76							}
77
78							sub make_stat{
79	5			5	1	2506	my ($edgesfile, $parentIndex, $childIndex)= @_;
80	5					15	my (%indeg, %outdeg, %deg, $ed, $nd, $mindeg, $maxdeg, $medeg, $avgdeg, $den, $scc, $resdeg, $stat, $res);
81							## create graph
82	5					23	my $g= Graph->new(directed => 1);
83	5	100				1397	open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n";
84	4					87	while(){
85	7					463	chomp;
86	7					31	my @vals= split(/\t/,$_);
87	7					31	$g->add_edge($vals[$parentIndex], $vals[$childIndex]);
88							}
89	4					418	close FH;
90							## compute indegree/outdegree/degree
91	4					20	my @V= $g->vertices;
92	4					223	foreach my $nd (@V){
93	11					24	my $i= $g->in_degree($nd);
94	11					1285	my $o= $g->out_degree($nd);
95	11					1236	my $d= $i+$o;
96	11					21	$indeg{$nd}=$i;
97	11					17	$outdeg{$nd}=$o;
98	11					22	$deg{$nd}=$d;
99							}
100	4	50				20	foreach my $node (sort{$deg{$b}<=>$deg{$a} or ($a cmp $b)} keys %deg){ $resdeg .= "$node\t$deg{$node}\t$indeg{$node}\t$outdeg{$node}\n"; }
	9					30
	11					37
101							## compute: median/max/min degree
102	4					12	my @sortdeg= sort{$a<=>$b} values (%deg);
	10					22
103	4					6	my $len= $#sortdeg+1;
104	4					15	my $mid = int $len/2;
105	4	100				11	if($len % 2){ $medeg = $sortdeg[$mid]; }else{ $medeg = ( $sortdeg[$mid-1] + $sortdeg[$mid] ) / 2; }
	3					7
	1					4
106	4					30	$medeg= sprintf("%.4f", $medeg);
107	4					7	$mindeg= $sortdeg[0];
108	4					21	$maxdeg= $sortdeg[$#sortdeg];
109							## compute number of nodes and edges
110	4					16	$ed= $g->edges;
111	4					160	$nd= $g->vertices;
112							## compute average degree and density
113	4					165	$avgdeg= $ed/$nd;
114	4					10	$den= $ed / ( $nd * ($nd -1) );
115	4					27	$avgdeg= sprintf("%.4f", $avgdeg);
116	4					14	$den= sprintf("%.4e", $den);
117							## return stat
118	4					15	$stat .= "nodes: $nd\nedges: $ed\nmax degree: $maxdeg\nmin degree: $mindeg\n";
119	4					12	$stat .= "median degree: $medeg\naverage degree: $avgdeg\ndensity: $den\n";
120	4					11	$res= "#oboterm degree indegree outdegree\n".$resdeg."\n"."~summary stat~\n".$stat;
121	4					62	return $res;
122							}
123
124							sub get_parents_or_children_list{
125	10			10	1	18555	my ($edgesfile, $parentIndex, $childIndex, $chdORpar)= @_;
126	10					21	my (%nodelist, %outlist);
127	10	100	100			49	if($chdORpar ne "parents" && $chdORpar ne "children"){ die "$chdORpar can be 'parents' or 'children'.\n";}
	2					16
128	8	100				283	open FH, "<", $edgesfile or die "cannot open $edgesfile. $!.\n";
129	6					128	while(){
130	12					26	chomp;
131	12					48	my @vals= split(/\t/,$_);
132	12	100				29	if($chdORpar eq "parents"){
133	6					8	push (@{$nodelist{$vals[$childIndex]}}, $vals[$parentIndex]);
	6					47
134							}else{
135	6					8	push (@{$nodelist{$vals[$parentIndex]}}, $vals[$childIndex]);
	6					49
136							}
137							}
138	6					56	close FH;
139	6					24	foreach my $term (keys %nodelist){
140	9					15	$outlist{$term} = join('\|', sort{($a=~/(\d+)/)[0] <=> ($b=~/(\d+)/)[0]} uniq @{$nodelist{$term}});
	3					42
	9					62
141							}
142	6					38	return \%outlist;
143							}
144
145							sub obo_filter{
146	5			5	1	18321	my ($obofile, $termsfile)= @_;
147	5					9	my (@oboterms, $header, $counter, $res, $obo);
148							## obo terms of interest
149	5	100				229	open FH, "<", "$termsfile" or die "cannot open $termsfile. $!.\n";
150	4					82	while(){
151	14					28	chomp;
152	14	100				53	next if $_=~/^\s*$/;
153	12					54	push(@oboterms, $_);
154							}
155	4					52	close FH;
156	4					9	my @unique= do { my %seen; grep { !$seen{$_}++ } @oboterms};
	4					13
	4					8
	12					52
157	4					9	@oboterms=@unique;
158							## store obofile header
159	4	100				24	if($obofile=~/.obo$/i){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; }
	4	50				137
160	3					59	while(){
161	357	100				763	next unless 1 .. /\[Term\]/;
162	81	100				148	next if /^\[Term\]/;
163	78					154	$header .= $_;
164							}
165							## extract from obo file the terms of interest
166	3					16	foreach my $oboterm (@oboterms){
167	8					22	$counter++;
168	8					250	open FH, $obofile;
169	8					149	while(){
170	952	100	66			3232	if(/^id:\s+($oboterm)/ .. (/^$/ \|\| eof FH)){ ## check for end of file
171	40					155	$res .= $_;
172							}
173							}
174	8					78	close FH;
175	8					307	print($counter, "\t", $oboterm, "\tdone\n");
176							}
177	3	100				16	unless(defined $res){ die "none obo terms listed in $termsfile file was found in $obofile file. $!.\n" }
	1					18
178							## print obofile filtered
179	2					11	$obo = $header.$res."\n";
180	2					41	$obo=~ s/\b(id:\s+(\D+\d+))/\[Term\]\n$1/g; ## note: word boundary (\b) is necessary to match only "id:"" and not "alt_id:" in the string $obo
181	2					14	return \$obo;
182							}
183
184							sub gene2biofun{
185	6			6	1	18509	my ($annfile, $geneIndex, $classIndex)= @_;
186	6					16	my (%gene2biofun, %biofun, @genes, @biofun, $stat)= ();
187	6					15	my ($sample, $oboterm)= (0)x2;
188	6	100				30	if($annfile=~/.gz$/i){ open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n"; } else { open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n"; }
	2	100				114
	4	100				174
189	4					118	while(){
190	27	100				475	next if $_=~/^[!,#]\|^\s*$/;
191	24					44	chomp;
192	24					103	my @vals=split(/\t/,$_);
193	24					67	push(@genes, $vals[$geneIndex]);
194	24					36	push(@biofun, $vals[$classIndex]);
195	24					30	push(@{$biofun{$vals[$geneIndex]}}, $vals[$classIndex]);
	24					117
196							}
197	4					43	close FH;
198	4					18	foreach my $gene (keys %biofun){
199	8					16	$gene2biofun{$gene} = join('\|', sort{($a=~/(\d+)/)[0] <=> ($b=~/(\d+)/)[0]} uniq @{$biofun{$gene}});
	24					139
	8					56
200							}
201	4					41	my @uniqgenes= uniq @genes;
202	4					25	my @uniqpbiofun= uniq @biofun;
203	4					10	$sample= scalar(@uniqgenes);
204	4					8	$oboterm= scalar(@uniqpbiofun);
205	4					14	$stat .= "genes: $sample\nontology terms: $oboterm\n";
206	4					32	return (\%gene2biofun, \$stat);
207							}
208
209							sub map_OBOterm_between_release{
210	9			9	1	20335	my ($obofile, $annfile, $classIndex)= @_;
211	9					19	my (%altid, %oldclass, %old2new, $header, $id, $fln, $pair, $stat, $pstat);
212	9					23	my ($alt, $classes, $seen, $unseen)= (0)x4;
213							## step 0: pairing altid_2_id (key: alt_id)
214	9	100				48	if($obofile=~/.obo$/i){ open FH, "<", "$obofile" or die "cannot open $obofile. $!.\n"; }
	9	50				307
215	7					115	while (){
216	322					408	chomp;
217	322	100				764	next if $_=~/^\s*$/;
218	308	100				522	if($_=~/^id:\s+(\D+\d+)/){ $id=$1; }
	7					22
219	308	100				747	if($_=~/^alt_id:\s+(\D+\d+)/){ $altid{$1}=$id; }
	49					166
220							}
221	7					65	close FH;
222	7					22	$alt= keys(%altid);
223							# step 1: storing old ontology terms in a hash
224	7	100				32	if($annfile=~/.gz$/i){ open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n"; } else { open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n"; }
	2	100				139
	5	100				165
225	5					117	while(){
226	28					49	chomp;
227	28	100				451	if($_=~/^[!,#]\|^\s*$/){ $header .= "$_\n"; }
	2					6
228	28	100				400	next if $_=~/^[!,#]\|^\s*$/;
229	26					115	my @vals=split(/\t/,$_);
230	26					138	$oldclass{$vals[$classIndex]}=$vals[$classIndex];
231							}
232	5					59	close FH;
233	5					13	$classes= keys(%oldclass);
234							## step 2: mapping old GO terms to the new one using alt_id as key
235	5					9	my $tmp= "";
236	5					27	foreach my $k (sort{$a cmp $b} keys(%altid)){
	66					92
237	35	100				53	if($oldclass{$k}){
238	8					18	$old2new{$k}=$altid{$oldclass{$k}}; ## pairing
239	8					11	$seen++;
240	8					13	$tmp= $altid{$oldclass{$k}};
241							}else{
242	27					36	$tmp= "unseen";
243	27					34	$unseen++;
244							}
245	35	100				73	if($tmp ne "unseen"){
246	8					22	$pair .= "$k\t$altid{$oldclass{$k}}\n";
247							}
248							}
249							## step 3: substitute ALT-ID with the updated ID, then the annotation file is returned.
250	5	50				23	if($annfile=~/.gz$/i){ open FH, "<:gzip", $annfile or die "cannot open $annfile. $!.\n"; } else { open FH, "<", "$annfile" or die "cannot open $annfile. $!.\n"; }
	1	50				58
	4	100				134
251	5					81	while(){
252	28					54	chomp;
253	28	100				418	next if $_=~/^[!,#]\|^\s*$/;
254	26					107	my @vals= split(/\t/, $_);
255	26					45	my $oboterm= $vals[$classIndex];
256	26	100				52	if($old2new{$oboterm}){
257	16					21	$oboterm= $old2new{$oboterm};
258	16					139	$_=~ s/$vals[$classIndex]/$oboterm/g;
259	16					77	$fln .= "$_\n";
260							}else{
261	10					73	$fln .= "$_\n";
262							}
263							}
264	5					45	close FH;
265	5	100				17	if(defined $header){$fln = $header.$fln;}
	1					4
266							## print mapping stat
267	5					17	$stat .= "Tot. ontology terms:\t$classes\nTot. altID:\t$alt\nTot. altID seen:\t$seen\nTot. altID unseen:\t$unseen\n";
268	5	100				11	unless(not defined $pair){
269	4					13	$pstat .= "#alt-id id\n$pair\n$stat";
270	4					37	return (\$fln, \$pstat);
271							}
272	1					9	return (\$fln, \$stat);
273							}
274
275							1;
276
277							__END__