line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# Please direct questions and support issues to |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Cared for by Keith James |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Copyright Genome Research Ltd. |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# You may distribute this module under the same terms as Perl itself |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=encoding utf-8 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 NAME |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Bio::Tools::Geneid - Results of one geneid run |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
use Bio::Tools::Geneid; |
22
|
|
|
|
|
|
|
my $gid = Bio::Tools::Geneid(-file => "geneid.out"); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
while (my $gene = $gid->next_prediction) |
25
|
|
|
|
|
|
|
{ |
26
|
|
|
|
|
|
|
my @transcripts = $gene->transcripts; |
27
|
|
|
|
|
|
|
foreach my $t (@transcripts) |
28
|
|
|
|
|
|
|
{ |
29
|
|
|
|
|
|
|
my @exons = $t->exons; |
30
|
|
|
|
|
|
|
foreach my $e (@exons) |
31
|
|
|
|
|
|
|
{ |
32
|
|
|
|
|
|
|
printf("Exon %d..%d\n", $e->start, $e->end); |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
} |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=head1 DESCRIPTION |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This is the parser for the output of geneid by Enrique Blanco and |
40
|
|
|
|
|
|
|
Roderic GuigE<243> (IMIM-UPF). See http://www1.imim.es/software/geneid. It |
41
|
|
|
|
|
|
|
relies on native geneid output format internally and will work with |
42
|
|
|
|
|
|
|
geneid versions 1.0 and 1.1. Currently this module supports only the |
43
|
|
|
|
|
|
|
default mode of operation which is to predict exons and assemble an |
44
|
|
|
|
|
|
|
optimal gene prediction. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
It takes either a file handle or a file name and returns a |
47
|
|
|
|
|
|
|
Bio::SeqFeature::Gene::GeneStructure object. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 FEEDBACK |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head2 Mailing Lists |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
54
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
55
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
58
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 Support |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
I |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
67
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
68
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
69
|
|
|
|
|
|
|
with code and data examples if at all possible. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head2 Reporting Bugs |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
74
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
75
|
|
|
|
|
|
|
web: |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=head1 AUTHOR - Keith James |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
Email: kdj@sanger.ac.uk |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head1 APPENDIX |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. |
86
|
|
|
|
|
|
|
Internal methods are usually preceded with a _ |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=cut |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
package Bio::Tools::Geneid; |
91
|
|
|
|
|
|
|
|
92
|
1
|
|
|
1
|
|
515
|
use vars qw($SOURCE_TAG); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
93
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
18
|
|
94
|
|
|
|
|
|
|
|
95
|
1
|
|
|
1
|
|
225
|
use Bio::Tools::AnalysisResult; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
28
|
|
96
|
1
|
|
|
1
|
|
298
|
use Bio::SeqFeature::Generic; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
31
|
|
97
|
1
|
|
|
1
|
|
281
|
use Bio::SeqFeature::Gene::Exon; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
30
|
|
98
|
1
|
|
|
1
|
|
258
|
use Bio::SeqFeature::Gene::Transcript; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
32
|
|
99
|
1
|
|
|
1
|
|
260
|
use Bio::SeqFeature::Gene::GeneStructure; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
37
|
|
100
|
|
|
|
|
|
|
|
101
|
1
|
|
|
1
|
|
5
|
use base qw(Bio::Root::Root Bio::Root::IO); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
759
|
|
102
|
|
|
|
|
|
|
$SOURCE_TAG = 'geneid'; |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 new |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
Title : new |
107
|
|
|
|
|
|
|
Usage : $obj->new(-file = "
|
108
|
|
|
|
|
|
|
$obj->new(-fh => \*GI); |
109
|
|
|
|
|
|
|
Function: Constructor for geneid wrapper. Takes either a file |
110
|
|
|
|
|
|
|
: or filehandle |
111
|
|
|
|
|
|
|
Returns : L |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=cut |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub new |
116
|
|
|
|
|
|
|
{ |
117
|
1
|
|
|
1
|
1
|
14
|
my($class, @args) = @_; |
118
|
1
|
|
|
|
|
23
|
my $self = $class->SUPER::new(@args); |
119
|
1
|
|
|
|
|
9
|
$self->_initialize_io(@args); |
120
|
1
|
|
|
|
|
4
|
return $self; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head2 next_prediction |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
Title : next_prediction |
126
|
|
|
|
|
|
|
Usage : while($gene = $geneid->next_prediction) |
127
|
|
|
|
|
|
|
{ |
128
|
|
|
|
|
|
|
# do something |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
Function: Returns the gene structure prediction of the geneid result |
131
|
|
|
|
|
|
|
file. Call this method repeatedly until FALSE is returned. |
132
|
|
|
|
|
|
|
Returns : A Bio::SeqFeature::Gene::GeneStructure object |
133
|
|
|
|
|
|
|
Args : None |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
=cut |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
sub next_prediction |
138
|
|
|
|
|
|
|
{ |
139
|
4
|
|
|
4
|
1
|
23
|
my ($self) = @_; |
140
|
|
|
|
|
|
|
|
141
|
4
|
|
|
|
|
6
|
my ($gene, $transcript, $current_gene_id); |
142
|
4
|
|
|
|
|
5
|
my $transcript_score = 0; |
143
|
|
|
|
|
|
|
|
144
|
4
|
|
|
|
|
7
|
my ($gene_id, $exon_type, $exon_start, $exon_end, $exon_score, |
145
|
|
|
|
|
|
|
$exon_strand, $start_phase, $end_phase, $start_sig_score, |
146
|
|
|
|
|
|
|
$end_sig_score, $coding_pot_score, $homol_score); |
147
|
|
|
|
|
|
|
|
148
|
4
|
|
|
|
|
12
|
while (defined($_ = $self->_readline)) |
149
|
|
|
|
|
|
|
{ |
150
|
39
|
|
|
|
|
86
|
$self->debug($_); |
151
|
|
|
|
|
|
|
|
152
|
39
|
|
|
|
|
89
|
s/^\s+//; |
153
|
39
|
|
|
|
|
164
|
s/\s+$//; |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# We have a choice of geneid, gff or XML formats. The native |
156
|
|
|
|
|
|
|
# geneid format has more information than gff. However, we |
157
|
|
|
|
|
|
|
# then need to perform the hack of extracting the sequence ID |
158
|
|
|
|
|
|
|
# from the header of the embedded Fasta file which comes after |
159
|
|
|
|
|
|
|
# the exon data, as it is not stored elsewhere. Ack. |
160
|
|
|
|
|
|
|
# the new version of geneID includes the sequence ID in a slightly |
161
|
|
|
|
|
|
|
# different format and a new "or" statement was added below |
162
|
|
|
|
|
|
|
# also removed "unless defined $self->_target_id;" inorder to continue |
163
|
|
|
|
|
|
|
# generating new sequence IDs. |
164
|
|
|
|
|
|
|
|
165
|
39
|
100
|
66
|
|
|
128
|
if (/^>(\S+)\|GeneId/ or /^# Sequence (\S+)/) |
166
|
|
|
|
|
|
|
{ |
167
|
3
|
|
|
|
|
8
|
my $target_id = $1; |
168
|
3
|
|
|
|
|
8
|
$self->_target_id($target_id); |
169
|
3
|
|
|
|
|
6
|
next; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
36
|
100
|
|
|
|
127
|
next unless (/(Single|First|Internal|Terminal)/); |
173
|
|
|
|
|
|
|
|
174
|
10
|
|
|
|
|
81
|
my @fields = split(/\s+/, $_); |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# Grab gene_id from eol first as there are issues with |
177
|
|
|
|
|
|
|
# inconsistent whitespace in the AA coords field |
178
|
10
|
|
|
|
|
22
|
$gene_id = pop @fields; |
179
|
|
|
|
|
|
|
|
180
|
10
|
|
|
|
|
36
|
($exon_type, $exon_start, $exon_end, $exon_score, |
181
|
|
|
|
|
|
|
$exon_strand, $start_phase, $end_phase, $start_sig_score, |
182
|
|
|
|
|
|
|
$end_sig_score, $coding_pot_score, $homol_score) = @fields[0..10]; |
183
|
|
|
|
|
|
|
|
184
|
10
|
100
|
|
|
|
28
|
if (! defined $current_gene_id) |
|
|
100
|
|
|
|
|
|
185
|
|
|
|
|
|
|
{ |
186
|
|
|
|
|
|
|
# Starting the requested prediction |
187
|
3
|
|
|
|
|
4
|
$current_gene_id = $gene_id; |
188
|
3
|
|
|
|
|
5
|
$transcript_score = $exon_score; |
189
|
|
|
|
|
|
|
|
190
|
3
|
|
|
|
|
22
|
$gene = Bio::SeqFeature::Gene::GeneStructure->new(-source => |
191
|
|
|
|
|
|
|
$SOURCE_TAG); |
192
|
3
|
|
|
|
|
16
|
$transcript = Bio::SeqFeature::Gene::Transcript->new(-source => |
193
|
|
|
|
|
|
|
$SOURCE_TAG); |
194
|
|
|
|
|
|
|
|
195
|
3
|
|
|
|
|
9
|
$self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score, |
196
|
|
|
|
|
|
|
$exon_strand, $start_phase, $end_phase, $start_sig_score, |
197
|
|
|
|
|
|
|
$end_sig_score, $coding_pot_score, $homol_score); |
198
|
|
|
|
|
|
|
} |
199
|
|
|
|
|
|
|
elsif ($gene_id eq $current_gene_id) |
200
|
|
|
|
|
|
|
{ |
201
|
|
|
|
|
|
|
# Still in requested prediction |
202
|
5
|
|
|
|
|
18
|
$transcript_score += $exon_score; |
203
|
|
|
|
|
|
|
|
204
|
5
|
|
|
|
|
13
|
$self->_add_exon($gene, $transcript, $exon_type, $exon_start, $exon_end, $exon_score, |
205
|
|
|
|
|
|
|
$exon_strand, $start_phase, $end_phase, $start_sig_score, |
206
|
|
|
|
|
|
|
$end_sig_score, $coding_pot_score, $homol_score); |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
else |
209
|
|
|
|
|
|
|
{ |
210
|
|
|
|
|
|
|
# Found following prediction |
211
|
2
|
|
|
|
|
12
|
$self->_pushback($_); |
212
|
2
|
|
|
|
|
4
|
last; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
4
|
100
|
|
|
|
9
|
if (defined $gene) |
217
|
|
|
|
|
|
|
{ |
218
|
3
|
|
|
|
|
5
|
$transcript->seq_id($self->_target_id); |
219
|
3
|
|
|
|
|
10
|
$transcript->score($transcript_score); |
220
|
3
|
|
|
|
|
11
|
$gene->add_transcript($transcript); |
221
|
3
|
|
|
|
|
7
|
$gene->seq_id($self->_target_id); |
222
|
|
|
|
|
|
|
|
223
|
3
|
|
|
|
|
9
|
foreach my $exon ($gene->exons) |
224
|
|
|
|
|
|
|
{ |
225
|
8
|
|
|
|
|
11
|
$exon->seq_id($self->_target_id); |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
|
228
|
3
|
|
|
|
|
8
|
$self->_set_strand($gene); |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
4
|
|
|
|
|
9
|
return $gene; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=head2 _add_exon |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
Title : _add_exon |
237
|
|
|
|
|
|
|
Usage : $obj->_add_exon($gene, $transcript, ... exon data ...) |
238
|
|
|
|
|
|
|
Function: Adds a new exon to both gene and transcript from the data |
239
|
|
|
|
|
|
|
: supplied as args |
240
|
|
|
|
|
|
|
Example : |
241
|
|
|
|
|
|
|
Returns : Nothing |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=cut |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
sub _add_exon |
246
|
|
|
|
|
|
|
{ |
247
|
8
|
|
|
8
|
|
25
|
my ($self, $gene, $transcript, $exon_type, $exon_start, $exon_end, |
248
|
|
|
|
|
|
|
$exon_score, $exon_strand, $start_phase, $end_phase, $start_sig_score, |
249
|
|
|
|
|
|
|
$end_sig_score, $coding_pot_score, $homol_score) = @_; |
250
|
|
|
|
|
|
|
|
251
|
8
|
|
|
|
|
17
|
$exon_type =~ s/First/Initial/; |
252
|
|
|
|
|
|
|
|
253
|
8
|
100
|
|
|
|
13
|
my $strand = $exon_strand eq '+' ? 1 : -1; |
254
|
|
|
|
|
|
|
|
255
|
8
|
|
|
|
|
35
|
my $exon = Bio::SeqFeature::Gene::Exon->new(-source => $SOURCE_TAG, |
256
|
|
|
|
|
|
|
-start => $exon_start, |
257
|
|
|
|
|
|
|
-end => $exon_end, |
258
|
|
|
|
|
|
|
-strand => $strand, |
259
|
|
|
|
|
|
|
-score => $exon_score); |
260
|
8
|
|
|
|
|
18
|
$exon->is_coding(1); |
261
|
8
|
|
|
|
|
15
|
$exon->add_tag_value("Type", $exon_type); |
262
|
8
|
|
|
|
|
16
|
$exon->add_tag_value('phase', $start_phase); |
263
|
8
|
|
|
|
|
11
|
$exon->add_tag_value('end_phase', $end_phase); |
264
|
8
|
|
|
|
|
16
|
$exon->add_tag_value('start_signal_score', $start_sig_score); |
265
|
8
|
|
|
|
|
15
|
$exon->add_tag_value('end_signal_score', $end_sig_score); |
266
|
8
|
|
|
|
|
16
|
$exon->add_tag_value('coding_potential_score', $coding_pot_score); |
267
|
8
|
|
|
|
|
13
|
$exon->add_tag_value('homology_score', $homol_score); |
268
|
|
|
|
|
|
|
|
269
|
8
|
100
|
|
|
|
13
|
$transcript->strand($strand) unless $transcript->strand != 0; |
270
|
8
|
|
|
|
|
17
|
$transcript->add_exon($exon, $exon_type); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
=head2 _set_strand |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
Title : _set_strand |
276
|
|
|
|
|
|
|
Usage : $obj->_set_strand($gene) |
277
|
|
|
|
|
|
|
Function: Sets the overall gene strand to the same strand as all |
278
|
|
|
|
|
|
|
: the exons if they are all on the same strand, or to strand 0 |
279
|
|
|
|
|
|
|
: if the exons are on different strands. |
280
|
|
|
|
|
|
|
Example : |
281
|
|
|
|
|
|
|
Returns : Nothing |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
=cut |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub _set_strand |
286
|
|
|
|
|
|
|
{ |
287
|
3
|
|
|
3
|
|
4
|
my ($self, $gene) = @_; |
288
|
|
|
|
|
|
|
|
289
|
3
|
|
|
|
|
4
|
my $fwd = 0; |
290
|
3
|
|
|
|
|
3
|
my $rev = 0; |
291
|
|
|
|
|
|
|
|
292
|
3
|
|
|
|
|
6
|
my @exons = $gene->exons; |
293
|
3
|
|
|
|
|
5
|
foreach my $exon (@exons) |
294
|
|
|
|
|
|
|
{ |
295
|
8
|
|
|
|
|
17
|
my $strand = $exon->strand; |
296
|
|
|
|
|
|
|
|
297
|
8
|
100
|
|
|
|
21
|
if ($strand == 1) |
|
|
50
|
|
|
|
|
|
298
|
|
|
|
|
|
|
{ |
299
|
2
|
|
|
|
|
3
|
$fwd++; |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
elsif ($strand == -1) |
302
|
|
|
|
|
|
|
{ |
303
|
6
|
|
|
|
|
16
|
$rev++; |
304
|
|
|
|
|
|
|
} |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
|
307
|
3
|
100
|
|
|
|
15
|
if ($#exons == $fwd) |
|
|
50
|
|
|
|
|
|
308
|
|
|
|
|
|
|
{ |
309
|
1
|
|
|
|
|
3
|
$gene->strand(1); |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
elsif ($#exons == $rev) |
312
|
|
|
|
|
|
|
{ |
313
|
0
|
|
|
|
|
0
|
$gene->strand(-1); |
314
|
|
|
|
|
|
|
} |
315
|
|
|
|
|
|
|
else |
316
|
|
|
|
|
|
|
{ |
317
|
2
|
|
|
|
|
5
|
$gene->strand(0); |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
3
|
|
|
|
|
5
|
return $gene; |
321
|
|
|
|
|
|
|
} |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
=head2 _target_id |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
Title : _target_id |
326
|
|
|
|
|
|
|
Usage : $obj->_target_id |
327
|
|
|
|
|
|
|
Function: get/set for genomic sequence id |
328
|
|
|
|
|
|
|
Example : |
329
|
|
|
|
|
|
|
Returns : A target ID |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
=cut |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub _target_id |
334
|
|
|
|
|
|
|
{ |
335
|
17
|
|
|
17
|
|
21
|
my ($self,$val) = @_; |
336
|
17
|
100
|
|
|
|
25
|
if ($val) |
337
|
|
|
|
|
|
|
{ |
338
|
3
|
|
|
|
|
6
|
$self->{'_target_id'} = $val; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
17
|
|
|
|
|
42
|
return $self->{'_target_id'}; |
342
|
|
|
|
|
|
|
} |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
1; |