line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $Id: GoaParser.pm 2159 2013-02-20 Erick Antezana $ |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Module : GoaParser.pm |
4
|
|
|
|
|
|
|
# Purpose : Parse GOA files |
5
|
|
|
|
|
|
|
# License : Copyright (c) 2006-2015 by Vladimir Mironov. All rights reserved. |
6
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
7
|
|
|
|
|
|
|
# modify it under the same terms as Perl itself. |
8
|
|
|
|
|
|
|
# Contact : erick.antezana@gmail.com |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
package OBO::Parser::GoaParser; |
12
|
|
|
|
|
|
|
|
13
|
2
|
|
|
2
|
|
32036
|
use strict; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
49
|
|
14
|
2
|
|
|
2
|
|
10
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
45
|
|
15
|
2
|
|
|
2
|
|
9
|
use Carp; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
115
|
|
16
|
|
|
|
|
|
|
|
17
|
2
|
|
|
2
|
|
1465
|
use open qw(:std :utf8); # Make All I/O Default to UTF-8 |
|
2
|
|
|
|
|
2328
|
|
|
2
|
|
|
|
|
11
|
|
18
|
|
|
|
|
|
|
|
19
|
2
|
|
|
2
|
|
1583
|
use OBO::Core::Term; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
66
|
|
20
|
2
|
|
|
2
|
|
1205
|
use OBO::APO::GoaAssociation; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
56
|
|
21
|
2
|
|
|
2
|
|
1074
|
use OBO::APO::GoaAssociationSet; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
2737
|
|
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
# use Data::Dumper; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$Carp::Verbose = 0; |
26
|
|
|
|
|
|
|
my $verbose = 0; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub new { |
29
|
2
|
|
|
2
|
0
|
13
|
my $class = shift; |
30
|
2
|
|
|
|
|
5
|
my $self = {}; |
31
|
|
|
|
|
|
|
|
32
|
2
|
|
|
|
|
5
|
bless ( $self, $class ); |
33
|
2
|
|
|
|
|
6
|
return $self; |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head2 parse |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
Usage - $GoaParser->parse ( $FH, $map ) |
39
|
|
|
|
|
|
|
Returns - OBO::APO::GoaAssociationSet object |
40
|
|
|
|
|
|
|
Args - |
41
|
|
|
|
|
|
|
1. indirect filehandle to a GOA associations file |
42
|
|
|
|
|
|
|
2. ref to a hash { UniProtAC => UniProtID } or { GO ID => GO term name } to filter by, optional |
43
|
|
|
|
|
|
|
Function - converts a GOA associations file into a OBO::APO::GoaAssociationSet object |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=cut |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
sub parse { |
48
|
|
|
|
|
|
|
# TODO remove the option of filtering by GO map |
49
|
|
|
|
|
|
|
# TODO accomodate annotations including multiple taxa like: 'taxon:10090|taxon:10360', taxon:9606|taxon:44130 |
50
|
|
|
|
|
|
|
my ( |
51
|
4
|
|
|
4
|
1
|
994
|
$self, |
52
|
|
|
|
|
|
|
$in_file_path, |
53
|
|
|
|
|
|
|
$map # hash ref, either UP map or GO map, optional |
54
|
|
|
|
|
|
|
) = @_; |
55
|
|
|
|
|
|
|
|
56
|
4
|
50
|
|
|
|
140
|
open my $FH, '<', $in_file_path or croak "Can't open file '$in_file_path': $! "; |
57
|
4
|
|
|
|
|
42
|
my $goaAssocSet = OBO::APO::GoaAssociationSet->new ( ); |
58
|
|
|
|
|
|
|
# assumption: the map is not empty and no blank lines |
59
|
4
|
|
|
|
|
7
|
my $map_source; |
60
|
4
|
100
|
|
|
|
13
|
if ( $map ) { |
61
|
|
|
|
|
|
|
# identify the type of the map |
62
|
2
|
|
|
|
|
3
|
my @ids = keys %{ $map }; |
|
2
|
|
|
|
|
8
|
|
63
|
2
|
100
|
|
|
|
15
|
if ( $ids[0] =~ /\AGO:\d{7}\z/xms ) { |
|
|
50
|
|
|
|
|
|
64
|
1
|
|
|
|
|
2
|
$map_source = 'GO'; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
elsif ( $ids[0] =~ /\A\w{6}\z/xms ){ |
67
|
1
|
|
|
|
|
2
|
$map_source = 'UniProtKB'; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
else { |
70
|
0
|
|
|
|
|
0
|
carp "An illegal ID in the map! "; |
71
|
|
|
|
|
|
|
} |
72
|
2
|
50
|
|
|
|
8
|
print "Filtering by $map_source map\n" if $verbose; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
# Populate the OBO::APO::GoaAssociationSet class with objects |
75
|
4
|
|
|
|
|
5
|
my $count_added =0; |
76
|
4
|
|
|
|
|
7
|
my $count_rejected =0; |
77
|
4
|
|
|
|
|
55
|
while ( <$FH> ){ |
78
|
29
|
|
|
|
|
42
|
chomp; |
79
|
29
|
|
|
|
|
226
|
my @fields = split ( /\t/ ); |
80
|
29
|
50
|
|
|
|
76
|
next if ( @fields != 15 ); |
81
|
29
|
|
|
|
|
64
|
my $goaAssoc = load_data ( \@fields, $map, $map_source ); |
82
|
29
|
50
|
|
|
|
59
|
$goaAssoc ? $count_added++ : $count_rejected++; |
83
|
29
|
50
|
|
|
|
111
|
$goaAssocSet->add_unique ( $goaAssoc ) if $goaAssoc; |
84
|
|
|
|
|
|
|
}# end of while $FH |
85
|
4
|
|
|
|
|
62
|
close $FH; |
86
|
4
|
50
|
|
|
|
13
|
print "accepted associations: $count_added, rejected associations: $count_rejected\n" if $verbose; |
87
|
4
|
50
|
|
|
|
21
|
! $goaAssocSet->is_empty ( ) ? return $goaAssocSet : carp "The set is empty ! $! "; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=head2 work |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Usage - $GoaParser->work ( $ontology, $data, $up_map, $parent_protein_name ) # the last arg is optional |
93
|
|
|
|
|
|
|
Returns - a data structure with added proteins ( { NCBI_ID => {UP_AC => OBO::Core::Term object}} ) |
94
|
|
|
|
|
|
|
Args - |
95
|
|
|
|
|
|
|
1. OBO::Core::Ontology object, |
96
|
|
|
|
|
|
|
2. OBO::APO::GoaAssociationSet object, |
97
|
|
|
|
|
|
|
3. parent term name for proteins ( string ), # to link new proteins to, e.g. 'gene regulation protein' |
98
|
|
|
|
|
|
|
Function - adds GO associations ( and optionally protein terms ) to ontology |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=cut |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub work { |
103
|
|
|
|
|
|
|
my ( |
104
|
2
|
|
|
2
|
1
|
335
|
$self, |
105
|
|
|
|
|
|
|
$onto, |
106
|
|
|
|
|
|
|
$data, |
107
|
|
|
|
|
|
|
$parent_protein_name, |
108
|
|
|
|
|
|
|
) = @_ ; |
109
|
2
|
50
|
33
|
|
|
20
|
croak "Not enough arguments! " if ! ( $onto and $data ); |
110
|
|
|
|
|
|
|
|
111
|
2
|
|
|
|
|
4
|
my $parent_protein; |
112
|
|
|
|
|
|
|
my $taxon; |
113
|
2
|
100
|
|
|
|
6
|
if ( $parent_protein_name ) { |
114
|
1
|
|
33
|
|
|
7
|
$parent_protein = $onto->get_term_by_name ( $parent_protein_name ) || croak "No term for $parent_protein_name in ontology: $! "; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
2
|
|
|
|
|
5
|
my $is_a = 'is_a'; |
118
|
2
|
|
|
|
|
4
|
my $participates_in = 'participates_in'; |
119
|
2
|
|
|
|
|
4
|
my $located_in = 'located_in'; |
120
|
2
|
|
|
|
|
3
|
my $has_function = 'has_function'; |
121
|
2
|
|
|
|
|
4
|
my $has_source = 'has_source'; |
122
|
2
|
|
|
|
|
6
|
my @rel_types = ( $is_a, $participates_in, $located_in, $has_function, $has_source ); |
123
|
2
|
|
|
|
|
59
|
foreach ( @rel_types ){ |
124
|
10
|
50
|
|
|
|
34
|
croak "'$_' is not in ontology" unless ( $onto->{RELATIONSHIP_TYPES}->{$_} ); |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
2
|
|
|
|
|
3
|
my %proteins; # { NCBI_ID => {UP_AC => OBO::Core::Term object}} |
128
|
|
|
|
|
|
|
my %go_terms; # { GO_id => OBO::Core::Term object } |
129
|
0
|
|
|
|
|
0
|
my %taxa; # { NCBI_id => OBO::Core::Term object } |
130
|
|
|
|
|
|
|
|
131
|
2
|
|
|
|
|
4
|
foreach my $goaAssoc ( @{$data->{SET}} ){ |
|
2
|
|
|
|
|
5
|
|
132
|
|
|
|
|
|
|
# get GO term |
133
|
16
|
|
|
|
|
48
|
my $go_id = $goaAssoc->go_id ( ); |
134
|
16
|
50
|
|
|
|
40
|
next if ( $go_id eq 'GO:0008150' ); # remove associations with 'biological_process' |
135
|
16
|
|
|
|
|
27
|
my $go_term = $go_terms{$go_id}; |
136
|
16
|
100
|
|
|
|
34
|
if ( ! $go_term ) { # $go_term is not yet in the hash |
137
|
10
|
|
|
|
|
28
|
$go_term = $onto->get_term_by_id ( $go_id ); |
138
|
10
|
50
|
|
|
|
25
|
next if ( ! $go_term ); |
139
|
10
|
|
|
|
|
22
|
$go_terms{$go_id} = $go_term; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# get taxon |
143
|
|
|
|
|
|
|
# for multiple taxa in a single association: taxon:9606|taxon:44130 |
144
|
16
|
|
|
|
|
44
|
$goaAssoc->taxon ( ) =~ /\Ataxon:(\d+)/xms; |
145
|
16
|
50
|
|
|
|
49
|
my $ncbi_id = $1 or carp "No NCBI id: $!"; |
146
|
16
|
|
|
|
|
27
|
my $taxon_id = "NCBI:$ncbi_id"; |
147
|
16
|
|
|
|
|
28
|
my $taxon = $taxa{$ncbi_id}; |
148
|
16
|
100
|
|
|
|
33
|
if ( ! $taxon ) { |
149
|
2
|
|
33
|
|
|
7
|
$taxon = $onto->get_term_by_id ( $taxon_id ) || carp "No taxon term for $taxon_id in ontology: $! "; |
150
|
2
|
50
|
|
|
|
6
|
next if ! $taxon; |
151
|
2
|
|
|
|
|
5
|
$taxa{$ncbi_id} = $taxon; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# get protein term |
155
|
16
|
|
|
|
|
42
|
my $obj_id = $goaAssoc->obj_id ( ); |
156
|
16
|
|
|
|
|
45
|
my $db = $goaAssoc->obj_src ( ); |
157
|
16
|
|
|
|
|
35
|
my $prot_id = "$db:$obj_id"; |
158
|
16
|
|
|
|
|
39
|
my $protein = $proteins{$ncbi_id}{$obj_id}; |
159
|
16
|
100
|
|
|
|
31
|
if ( ! $protein ) { # $protein is not yet in the hash |
160
|
2
|
|
|
|
|
7
|
$protein = $onto->get_term_by_id ( $prot_id ); |
161
|
2
|
100
|
|
|
|
8
|
if ( ! $protein ) { # $protein is not yet in the ontolgy |
162
|
1
|
50
|
|
|
|
2
|
if ( $parent_protein_name ) { # testing whether new protein terms should be created |
163
|
1
|
|
|
|
|
9
|
$protein = OBO::Core::Term->new ( ); |
164
|
1
|
|
|
|
|
4
|
$protein->id ( $prot_id ); |
165
|
1
|
|
|
|
|
5
|
$onto->add_term ( $protein ); |
166
|
1
|
|
|
|
|
9
|
$onto->create_rel ( $protein, $has_source, $taxon ); |
167
|
1
|
|
|
|
|
3
|
$onto->create_rel ( $protein, $is_a, $parent_protein ); |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
else { |
170
|
|
|
|
|
|
|
# the protein not in the ontology and no new terms should be created |
171
|
|
|
|
|
|
|
# normally should not happen |
172
|
0
|
|
|
|
|
0
|
carp "No protein term in the ontology for $prot_id"; |
173
|
0
|
|
|
|
|
0
|
next; |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
} |
176
|
2
|
|
|
|
|
5
|
$proteins{$ncbi_id}{$obj_id} = $protein; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
# create relations |
181
|
16
|
|
|
|
|
42
|
my $aspect = $goaAssoc->aspect ( ); |
182
|
16
|
50
|
|
|
|
58
|
if ( $aspect eq 'F' ) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
183
|
0
|
|
|
|
|
0
|
$onto->create_rel ( $protein, $has_function, $go_term ); |
184
|
|
|
|
|
|
|
} |
185
|
|
|
|
|
|
|
elsif ( $aspect eq 'C' ) { |
186
|
14
|
|
|
|
|
35
|
$onto->create_rel ( $protein, $located_in, $go_term ); |
187
|
|
|
|
|
|
|
# $onto->create_rel ( $go_term, $location_of, $protein ); # inverse of 'located_in' |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
elsif ( $aspect eq 'P' ) { |
190
|
2
|
|
|
|
|
7
|
$onto->create_rel ( $protein, $participates_in, $go_term ); |
191
|
|
|
|
|
|
|
# $onto->create_rel ( $go_term, $has_participant, $protein ); # inverse of 'participates_in' |
192
|
|
|
|
|
|
|
} |
193
|
0
|
|
|
|
|
0
|
else {carp "An illegal GO aspect '$aspect'\n"} |
194
|
|
|
|
|
|
|
} |
195
|
2
|
|
|
|
|
11
|
return \%proteins; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub load_data { |
199
|
29
|
|
|
29
|
0
|
50
|
my ( $fields, $map, $map_source ) = @_; |
200
|
29
|
|
|
|
|
33
|
my @fields = @{$fields}; |
|
29
|
|
|
|
|
113
|
|
201
|
29
|
|
|
|
|
59
|
foreach ( @fields ) { |
202
|
435
|
|
|
|
|
624
|
$_ =~ s/^\s+//; |
203
|
435
|
|
|
|
|
737
|
$_ =~ s/\s+$//; |
204
|
|
|
|
|
|
|
} |
205
|
29
|
100
|
|
|
|
62
|
if ( $map ) { |
206
|
16
|
100
|
|
|
|
35
|
if ( $map_source eq 'GO' ) { |
207
|
8
|
50
|
|
|
|
23
|
return 0 unless $map->{$fields[4]}; |
208
|
|
|
|
|
|
|
} |
209
|
16
|
100
|
|
|
|
35
|
if ( $map_source eq 'UniProtKB' ) { |
210
|
8
|
50
|
|
|
|
26
|
return 0 unless $map->{$fields[1]}; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
} |
213
|
29
|
|
|
|
|
99
|
my $goaAssoc = OBO::APO::GoaAssociation->new ( ); |
214
|
29
|
|
|
|
|
89
|
$goaAssoc->assc_id ( $. ); |
215
|
29
|
|
|
|
|
77
|
$goaAssoc->obj_src ( $fields[0] ); |
216
|
29
|
|
|
|
|
80
|
$goaAssoc->obj_id ( $fields[1] ); |
217
|
29
|
|
|
|
|
73
|
$goaAssoc->obj_symb ( $fields[2] ); |
218
|
29
|
|
|
|
|
73
|
$goaAssoc->qualifier ( $fields[3] ); |
219
|
29
|
|
|
|
|
71
|
$goaAssoc->go_id ( $fields[4] ); |
220
|
29
|
|
|
|
|
78
|
$goaAssoc->refer ( $fields[5] ); |
221
|
29
|
|
|
|
|
75
|
$goaAssoc->evid_code ( $fields[6] ); |
222
|
29
|
|
|
|
|
103
|
$goaAssoc->sup_ref ( $fields[7] ); |
223
|
29
|
|
|
|
|
73
|
$goaAssoc->aspect ( $fields[8] ); |
224
|
29
|
|
|
|
|
75
|
$goaAssoc->description ( $fields[9] ); |
225
|
29
|
|
|
|
|
76
|
$goaAssoc->synonym ( $fields[10] ); |
226
|
29
|
|
|
|
|
75
|
$goaAssoc->type ( $fields[11] ); |
227
|
29
|
|
|
|
|
74
|
$goaAssoc->taxon ( $fields[12] ); |
228
|
29
|
|
|
|
|
76
|
$goaAssoc->date ( $fields[13] ); |
229
|
29
|
|
|
|
|
75
|
$goaAssoc->annot_src ( $fields[14] ); |
230
|
29
|
|
|
|
|
84
|
return $goaAssoc; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
1; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
__END__ |