line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: NCBIParser.pm 2113 2010-09-29 Erick Antezana $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Module : NCBIParser.pm |
4
|
|
|
|
|
|
|
# Purpose : Parse NCBI files: names and nodes |
5
|
|
|
|
|
|
|
# License : Copyright ( c ) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved. |
6
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
7
|
|
|
|
|
|
|
# modify it under the same terms as Perl itself. |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
package OBO::Parser::NCBIParser; |
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
770
|
use OBO::Core::Term; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
36
|
|
12
|
1
|
|
|
1
|
|
7
|
use OBO::Parser::OBOParser; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
26
|
|
13
|
1
|
|
|
1
|
|
7
|
use OBO::Core::Ontology; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
21
|
|
14
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
3
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
16
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
35
|
|
17
|
1
|
|
|
1
|
|
7
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
671
|
|
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
$Carp::Verbose = 0; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
sub new { |
22
|
1
|
|
|
1
|
0
|
8
|
my $class = shift; |
23
|
1
|
|
|
|
|
2
|
my $self = {}; |
24
|
1
|
|
|
|
|
2
|
bless ( $self, $class ); |
25
|
1
|
|
|
|
|
2
|
return $self; |
26
|
|
|
|
|
|
|
} |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head2 parse |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Usage - $ncbi_parser->parse ( $ncbi_nodes_path ) |
31
|
|
|
|
|
|
|
Returns - ref to a hash { child_id => parent_id } |
32
|
|
|
|
|
|
|
Args - nodes.dmp path ( string ) |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
Usage - $ncbi_parser->parse ( $ncbi_names_path, $name_type ) |
35
|
|
|
|
|
|
|
Returns - ref to a hash { ncbi_id => ncbi_name } |
36
|
|
|
|
|
|
|
Args - |
37
|
|
|
|
|
|
|
1. names.dmp path or nodes.dmp path, string |
38
|
|
|
|
|
|
|
2. ncbi name type ( string, e.g. 'scientific name' ) if the first arg is names.dmp otherwise none |
39
|
|
|
|
|
|
|
Function - parses the complete NCBI taxonomy |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=cut |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
sub parse { |
44
|
|
|
|
|
|
|
my ( |
45
|
2
|
|
|
2
|
1
|
400
|
$self, |
46
|
|
|
|
|
|
|
$input_path, |
47
|
|
|
|
|
|
|
$name_type # optional |
48
|
|
|
|
|
|
|
) = @_; |
49
|
2
|
50
|
|
|
|
8
|
croak "Not enough arguments! " if ( @_ < 1 ); |
50
|
2
|
|
33
|
|
|
139
|
open my $IN, '<', $input_path || croak "Can't open file '$input_path': $! "; |
51
|
2
|
|
|
|
|
41
|
my @in_lines = <$IN>; |
52
|
2
|
|
|
|
|
4
|
my %map; |
53
|
2
|
100
|
|
|
|
6
|
if ( $name_type ) { # parsing names.dmp, |
54
|
|
|
|
|
|
|
# %map: ncbi_id => scientific_name |
55
|
1
|
|
|
|
|
4
|
foreach my $line ( @in_lines ){ |
56
|
22
|
|
|
|
|
52
|
my @fields = split /\t\|/, $line; |
57
|
22
|
100
|
|
|
|
57
|
if ( $fields[3] eq "\t$name_type" ) { |
58
|
10
|
|
|
|
|
8
|
my $key = $fields[0]; |
59
|
10
|
|
|
|
|
12
|
my $value = substr $fields[1], 1; |
60
|
10
|
|
|
|
|
21
|
$map{$key} = $value; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
else { # parsing nodes.dmp |
65
|
|
|
|
|
|
|
# %map: child_id => parent_id |
66
|
1
|
|
|
|
|
6
|
foreach my $line ( @in_lines ){ |
67
|
8
|
|
|
|
|
33
|
my @fields = split /\t\|/, $line; |
68
|
8
|
|
|
|
|
11
|
my $key = $fields[0]; |
69
|
8
|
|
|
|
|
10
|
my $value = substr $fields[1], 1; |
70
|
8
|
|
|
|
|
24
|
$map{$key} = $value; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
} |
73
|
2
|
|
|
|
|
76
|
close ( $IN ); |
74
|
2
|
|
|
|
|
15
|
return \%map; |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=head2 work |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Usage - $NCBIParser->work ( $onto, $nodes, $names, $ncbi_ids ) |
80
|
|
|
|
|
|
|
Returns - map of added terms { NCBI ID => OBO::Core::Term object } |
81
|
|
|
|
|
|
|
Args - |
82
|
|
|
|
|
|
|
1. input ontology, OBO::Core::Ontology object |
83
|
|
|
|
|
|
|
2. ref to a hash { child_id => parent_id } |
84
|
|
|
|
|
|
|
3. ref to a hash { ncbi_id => scientific_name } |
85
|
|
|
|
|
|
|
4. parental ontology term for the root of the taxonomy, OBO::Core::Term object |
86
|
|
|
|
|
|
|
5. ref to a list of NCBI taxon ids ( \d+ ) |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Function - adds NCBI taxonomy to the input ontology for the specified taxa |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=cut |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub work { |
93
|
|
|
|
|
|
|
my ( |
94
|
1
|
|
|
1
|
1
|
7
|
$self, |
95
|
|
|
|
|
|
|
$ontology, |
96
|
|
|
|
|
|
|
$nodes, |
97
|
|
|
|
|
|
|
$names, |
98
|
|
|
|
|
|
|
$parent, |
99
|
|
|
|
|
|
|
$ncbi_ids, |
100
|
|
|
|
|
|
|
) = @_; |
101
|
|
|
|
|
|
|
|
102
|
1
|
|
|
|
|
2
|
my %selected_nodes = ( ); # taxon id => parent id |
103
|
1
|
|
|
|
|
2
|
my %selected_names = ( ); # taxon id => taxon name |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# the hashes %selected_nodes and %selected_names are being populated: |
106
|
1
|
|
|
|
|
2
|
foreach my $ncbi_id ( @{$ncbi_ids} ) { |
|
1
|
|
|
|
|
2
|
|
107
|
1
|
|
|
|
|
4
|
getParentsRecursively ( $ncbi_id, $nodes, $names, \%selected_nodes, \%selected_names ); |
108
|
|
|
|
|
|
|
} |
109
|
1
|
|
|
|
|
1
|
my %map; # NCBI_ID=>taxon_term |
110
|
|
|
|
|
|
|
# the terms are created and added to the ontology and %map |
111
|
1
|
|
|
|
|
4
|
foreach my $ncbi_id ( keys %selected_nodes ){ |
112
|
4
|
|
|
|
|
4
|
my $selected_name = $selected_names{$ncbi_id}; |
113
|
4
|
|
|
|
|
12
|
my $taxon = OBO::Core::Term->new ( ); |
114
|
4
|
|
|
|
|
13
|
$taxon->id ( "NCBI:$ncbi_id" ); |
115
|
4
|
|
|
|
|
11
|
$taxon->name ( $selected_name ); |
116
|
4
|
|
|
|
|
9
|
$ontology->add_term ( $taxon ); |
117
|
4
|
|
|
|
|
9
|
$map{$ncbi_id} = $taxon; |
118
|
|
|
|
|
|
|
} # the end of foreach |
119
|
|
|
|
|
|
|
# Connect children to parents by 'is_a' relationships but not if the child is root ( cyclic is_a ) |
120
|
1
|
|
|
|
|
4
|
foreach my $ncbi_id ( keys %selected_nodes ) { |
121
|
4
|
50
|
|
|
|
9
|
my $child = $map{$ncbi_id} or croak "No term for '$ncbi_id' in the map: $! "; |
122
|
4
|
50
|
|
|
|
9
|
my $parent = $map{$selected_nodes{$ncbi_id}} or croak "No term for '$selected_nodes{$ncbi_id}' in the map: $! "; |
123
|
4
|
100
|
|
|
|
16
|
$ontology->create_rel ( $child, 'is_a', $parent ) if ( $ncbi_id != 1 ); |
124
|
|
|
|
|
|
|
} |
125
|
1
|
50
|
|
|
|
4
|
my $root = $ontology->get_term_by_id ( 'NCBI:1' ) or croak "No term in the ontology for 'root': $!"; |
126
|
1
|
|
|
|
|
4
|
$ontology->create_rel ( $root, 'is_a', $parent ); |
127
|
1
|
|
|
|
|
6
|
return \%map; |
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
######################################################################## |
131
|
|
|
|
|
|
|
# Subroutines |
132
|
|
|
|
|
|
|
######################################################################## |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sub getParentsRecursively { |
135
|
4
|
|
|
4
|
0
|
5
|
my ( $ncbi_id, $nodes, $names, $selected_nodes, $selected_names ) = @_; |
136
|
4
|
|
|
|
|
3
|
my $child_id = $ncbi_id; |
137
|
4
|
|
|
|
|
3
|
my $parent_id = ${$nodes}{$ncbi_id}; |
|
4
|
|
|
|
|
5
|
|
138
|
4
|
|
|
|
|
4
|
my $child_name = ${$names}{$ncbi_id}; |
|
4
|
|
|
|
|
4
|
|
139
|
4
|
|
|
|
|
3
|
my $parent_name = ${$names}{$ncbi_id}; |
|
4
|
|
|
|
|
5
|
|
140
|
4
|
|
|
|
|
5
|
$selected_nodes->{$child_id} = $parent_id; |
141
|
4
|
|
|
|
|
5
|
$selected_names->{$child_id} = $child_name; |
142
|
4
|
100
|
|
|
|
17
|
getParentsRecursively ( $parent_id, $nodes, $names, $selected_nodes, $selected_names ) if ( $child_id != 1 ); |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
1; |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
__END__ |