line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::Tools::GFF |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by the Bioperl core team |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Matthew Pocock |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::Tools::GFF - A Bio::SeqAnalysisParserI compliant GFF format parser |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
use Bio::Tools::GFF; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# specify input via -fh or -file |
23
|
|
|
|
|
|
|
my $gffio = Bio::Tools::GFF->new(-fh => \*STDIN, -gff_version => 2); |
24
|
|
|
|
|
|
|
my $feature; |
25
|
|
|
|
|
|
|
# loop over the input stream |
26
|
|
|
|
|
|
|
while($feature = $gffio->next_feature()) { |
27
|
|
|
|
|
|
|
# do something with feature |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
$gffio->close(); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# you can also obtain a GFF parser as a SeqAnalasisParserI in |
32
|
|
|
|
|
|
|
# HT analysis pipelines (see Bio::SeqAnalysisParserI and |
33
|
|
|
|
|
|
|
# Bio::Factory::SeqAnalysisParserFactory) |
34
|
|
|
|
|
|
|
my $factory = Bio::Factory::SeqAnalysisParserFactory->new(); |
35
|
|
|
|
|
|
|
my $parser = $factory->get_parser(-input => \*STDIN, -method => "gff"); |
36
|
|
|
|
|
|
|
while($feature = $parser->next_feature()) { |
37
|
|
|
|
|
|
|
# do something with feature |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 DESCRIPTION |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
This class provides a simple GFF parser and writer. In the sense of a |
43
|
|
|
|
|
|
|
SeqAnalysisParser, it parses an input file or stream into SeqFeatureI |
44
|
|
|
|
|
|
|
objects, but is not in any way specific to a particular analysis |
45
|
|
|
|
|
|
|
program and the output that program produces. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
That is, if you can get your analysis program spit out GFF, here is |
48
|
|
|
|
|
|
|
your result parser. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 GFF3 AND SEQUENCE DATA |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
GFF3 supports sequence data; see |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
http://www.sequenceontology.org/gff3.shtml |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
There are a number of ways to deal with this - |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
If you call |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
$gffio->ignore_sequence(1) |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
prior to parsing the sequence data is ignored; this is useful if you |
63
|
|
|
|
|
|
|
just want the features. It avoids the memory overhead in building and |
64
|
|
|
|
|
|
|
caching sequences |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Alternatively, you can call either |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
$gffio->get_seqs() |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
Or |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
$gffio->seq_id_by_h() |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
At the B of parsing to get either a list or hashref of Bio::Seq |
75
|
|
|
|
|
|
|
objects (see the documentation for each of these methods) |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
Note that these objects will not have the features attached - you have |
78
|
|
|
|
|
|
|
to do this yourself, OR call |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
$gffio->features_attached_to_seqs(1) |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
PRIOR to parsing; this will ensure that the Seqs have the features |
83
|
|
|
|
|
|
|
attached; ie you will then be able to call |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
$seq->get_SeqFeatures(); |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
And use Bio::SeqIO methods |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
Note that auto-attaching the features to seqs will incur a higher |
90
|
|
|
|
|
|
|
memory overhead as the features must be cached until the sequence data |
91
|
|
|
|
|
|
|
is found |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head1 TODO |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Make a Bio::SeqIO class specifically for GFF3 with sequence data |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head1 FEEDBACK |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=head2 Mailing Lists |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
102
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
103
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
106
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head2 Support |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
I |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
115
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
116
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
117
|
|
|
|
|
|
|
with code and data examples if at all possible. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=head2 Reporting Bugs |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
122
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted the web: |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head1 AUTHOR - Matthew Pocock |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Email mrp-at-sanger.ac.uk |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
=head1 CONTRIBUTORS |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
Jason Stajich, jason-at-biperl-dot-org |
133
|
|
|
|
|
|
|
Chris Mungall, cjm-at-fruitfly-dot-org |
134
|
|
|
|
|
|
|
Steffen Grossmann [SG], grossman at molgen.mpg.de |
135
|
|
|
|
|
|
|
Malcolm Cook, mec-at-stowers-institute.org |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=head1 APPENDIX |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _ |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=cut |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# Let the code begin... |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
package Bio::Tools::GFF; |
146
|
|
|
|
|
|
|
|
147
|
146
|
|
|
146
|
|
998
|
use vars qw($HAS_HTML_ENTITIES); |
|
146
|
|
|
|
|
174
|
|
|
146
|
|
|
|
|
5833
|
|
148
|
146
|
|
|
146
|
|
503
|
use strict; |
|
146
|
|
|
|
|
159
|
|
|
146
|
|
|
|
|
2354
|
|
149
|
|
|
|
|
|
|
|
150
|
146
|
|
|
146
|
|
38297
|
use Bio::Seq::SeqFactory; |
|
146
|
|
|
|
|
207
|
|
|
146
|
|
|
|
|
3083
|
|
151
|
146
|
|
|
146
|
|
32111
|
use Bio::LocatableSeq; |
|
146
|
|
|
|
|
225
|
|
|
146
|
|
|
|
|
3183
|
|
152
|
146
|
|
|
146
|
|
959
|
use Bio::SeqFeature::Generic; |
|
146
|
|
|
|
|
181
|
|
|
146
|
|
|
|
|
3034
|
|
153
|
|
|
|
|
|
|
|
154
|
146
|
|
|
146
|
|
469
|
use base qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); |
|
146
|
|
|
|
|
162
|
|
|
146
|
|
|
|
|
68642
|
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
my $i = 0; |
157
|
|
|
|
|
|
|
my %GFF3_ID_Tags = map { $_ => $i++ } qw(ID Parent Target); |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# for skipping data that may be represented elsewhere; currently, this is |
160
|
|
|
|
|
|
|
# only the score |
161
|
|
|
|
|
|
|
my %SKIPPED_TAGS = map { $_ => 1 } qw(score); |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head2 new |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Title : new |
167
|
|
|
|
|
|
|
Usage : my $parser = Bio::Tools::GFF->new(-gff_version => 2, |
168
|
|
|
|
|
|
|
-file => "filename.gff"); |
169
|
|
|
|
|
|
|
or |
170
|
|
|
|
|
|
|
my $writer = Bio::Tools::GFF->new(-gff_version => 3, |
171
|
|
|
|
|
|
|
-file => ">filename.gff3"); |
172
|
|
|
|
|
|
|
Function: Creates a new instance. Recognized named parameters are -file, -fh, |
173
|
|
|
|
|
|
|
and -gff_version. |
174
|
|
|
|
|
|
|
Returns : a new object |
175
|
|
|
|
|
|
|
Args : named parameters |
176
|
|
|
|
|
|
|
-gff_version => [1,2,3] |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=cut |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
{ # make a class variable such that we can generate unique ID's over |
181
|
|
|
|
|
|
|
# a session, no matter how many instances of GFF.pm we make |
182
|
|
|
|
|
|
|
# since these have to be unique within the scope of a GFF file. |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
my $gff3_featureID = 0; |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
sub _incrementGFF3ID { |
187
|
8
|
|
|
8
|
|
7
|
my ($self) = @_; |
188
|
8
|
|
|
|
|
11
|
return ++ $gff3_featureID; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub new { |
194
|
16
|
|
|
16
|
1
|
514
|
my ($class, @args) = @_; |
195
|
16
|
|
|
|
|
52
|
my $self = $class->SUPER::new(@args); |
196
|
|
|
|
|
|
|
|
197
|
16
|
|
|
|
|
50
|
my ($gff_version, $noparse) = $self->_rearrange([qw(GFF_VERSION NOPARSE)],@args); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# initialize IO |
200
|
16
|
|
|
|
|
54
|
$self->_initialize_io(@args); |
201
|
16
|
50
|
|
|
|
48
|
$self->_parse_header() unless $noparse; |
202
|
|
|
|
|
|
|
|
203
|
16
|
|
50
|
|
|
27
|
$gff_version ||= 2; |
204
|
16
|
50
|
|
|
|
33
|
if( ! $self->gff_version($gff_version) ) { |
205
|
0
|
|
|
|
|
0
|
$self->throw("Can't build a GFF object with the unknown version ". |
206
|
|
|
|
|
|
|
$gff_version); |
207
|
|
|
|
|
|
|
} |
208
|
16
|
|
|
|
|
20
|
$self->{'_first'} = 1; |
209
|
16
|
|
|
|
|
36
|
return $self; |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head2 _parse_header |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
Title : _parse_header |
216
|
|
|
|
|
|
|
Usage : $gffio->_parse_header() |
217
|
|
|
|
|
|
|
Function: used to turn parse GFF header lines. currently |
218
|
|
|
|
|
|
|
produces Bio::LocatableSeq objects from ##sequence-region |
219
|
|
|
|
|
|
|
lines |
220
|
|
|
|
|
|
|
Returns : 1 on success |
221
|
|
|
|
|
|
|
Args : none |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=cut |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
sub _parse_header{ |
227
|
16
|
|
|
16
|
|
16
|
my ($self) = @_; |
228
|
|
|
|
|
|
|
|
229
|
16
|
|
|
|
|
15
|
my @unhandled; |
230
|
16
|
|
|
|
|
55
|
local $^W = 0; # hide warnings when we try and parse from a file opened |
231
|
|
|
|
|
|
|
# for writing - there isn't really a better way to do |
232
|
|
|
|
|
|
|
# AFAIK - cannot detech if a FH is read or write. |
233
|
16
|
|
|
|
|
41
|
while(my $line = $self->_readline()){ |
234
|
46
|
|
|
|
|
35
|
my $handled = 0; |
235
|
46
|
50
|
|
|
|
63
|
next if /^\s+$/; |
236
|
46
|
100
|
|
|
|
131
|
if($line =~ /^\#\#sequence-region\s+(\S+)\s+(\S+)\s+(\S+)\s*/){ |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
237
|
42
|
|
|
|
|
93
|
my($seqid,$start,$end) = ($1,$2,$3); |
238
|
42
|
|
|
|
|
23
|
push @{ $self->{'segments'} }, Bio::LocatableSeq->new( |
|
42
|
|
|
|
|
83
|
|
239
|
|
|
|
|
|
|
-id => unescape($seqid), |
240
|
|
|
|
|
|
|
-start => $start, |
241
|
|
|
|
|
|
|
-end => $end, |
242
|
|
|
|
|
|
|
-length => ($end - $start + 1), ## make the length explicit |
243
|
|
|
|
|
|
|
); |
244
|
42
|
|
|
|
|
48
|
$handled = 1; |
245
|
|
|
|
|
|
|
} elsif($line =~ /^(\#\#feature-ontology)/) { |
246
|
|
|
|
|
|
|
#to be implemented |
247
|
0
|
|
|
|
|
0
|
$self->warn("$1 header tag parsing unimplemented"); |
248
|
|
|
|
|
|
|
} elsif($line =~ /^(\#\#attribute-ontology)/) { |
249
|
|
|
|
|
|
|
#to be implemented |
250
|
0
|
|
|
|
|
0
|
$self->warn("$1 header tag parsing unimplemented"); |
251
|
|
|
|
|
|
|
} elsif($line =~ /^(\#\#source-ontology)/) { |
252
|
|
|
|
|
|
|
#to be implemented |
253
|
0
|
|
|
|
|
0
|
$self->warn("$1 header tag parsing unimplemented"); |
254
|
|
|
|
|
|
|
} elsif($line =~ /^(\#\#\#)/) { |
255
|
|
|
|
|
|
|
#to be implemented |
256
|
0
|
|
|
|
|
0
|
$self->warn("$1 header tag parsing unimplemented"); |
257
|
|
|
|
|
|
|
} elsif($line =~ /^(\#\#FASTA)/) { |
258
|
|
|
|
|
|
|
# initial ##FASTA is optional - artemis does not use it |
259
|
0
|
|
|
|
|
0
|
$line = $self->_readline(); |
260
|
0
|
0
|
|
|
|
0
|
if ($line !~ /^\>(\S+)/) { |
261
|
0
|
|
|
|
|
0
|
$self->throw("##FASTA directive must be followed by fasta header, not: $line"); |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
46
|
50
|
|
|
|
77
|
if ($line =~ /^\>(.*)/) { |
266
|
|
|
|
|
|
|
# seq data can be at header or footer |
267
|
0
|
|
|
|
|
0
|
my $seq = $self->_parse_sequence($line); |
268
|
0
|
0
|
|
|
|
0
|
if ($seq) { |
269
|
0
|
|
|
|
|
0
|
$self->_seq_by_id_h->{$seq->primary_id} = $seq; |
270
|
|
|
|
|
|
|
} |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
|
274
|
46
|
100
|
|
|
|
55
|
if(!$handled){ |
275
|
4
|
|
|
|
|
5
|
push @unhandled, $line; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
#looks like the header is over! |
279
|
46
|
100
|
|
|
|
148
|
last unless $line =~ /^\#/; |
280
|
|
|
|
|
|
|
} |
281
|
|
|
|
|
|
|
|
282
|
16
|
|
|
|
|
30
|
foreach my $line (@unhandled){ |
283
|
4
|
|
|
|
|
13
|
$self->_pushback($line); |
284
|
|
|
|
|
|
|
} |
285
|
|
|
|
|
|
|
|
286
|
16
|
|
|
|
|
29
|
return 1; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
sub _parse_sequence { |
290
|
0
|
|
|
0
|
|
0
|
my ($self, $line) = @_; |
291
|
|
|
|
|
|
|
|
292
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^\>(.*)/) { |
293
|
|
|
|
|
|
|
|
294
|
0
|
|
|
|
|
0
|
my $seqid = $1; |
295
|
0
|
|
|
|
|
0
|
$seqid =~ s/\s+$//; |
296
|
0
|
|
|
|
|
0
|
my $desc = ''; |
297
|
0
|
0
|
|
|
|
0
|
if ($seqid =~ /(\S+)\s+(.*)/) { |
298
|
0
|
|
|
|
|
0
|
($seqid, $desc) = ($1,$2); |
299
|
|
|
|
|
|
|
} |
300
|
0
|
|
|
|
|
0
|
my $res = ''; |
301
|
0
|
|
|
|
|
0
|
while (my $line = $self->_readline) { |
302
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^\#/) { |
303
|
0
|
|
|
|
|
0
|
last; |
304
|
|
|
|
|
|
|
} |
305
|
0
|
0
|
|
|
|
0
|
if ($line =~ /^\>/) { |
306
|
0
|
|
|
|
|
0
|
$self->_pushback($line); |
307
|
0
|
|
|
|
|
0
|
last; |
308
|
|
|
|
|
|
|
} |
309
|
0
|
|
|
|
|
0
|
$line =~ s/\s//g; |
310
|
0
|
|
|
|
|
0
|
$res .= $line; |
311
|
|
|
|
|
|
|
} |
312
|
0
|
0
|
|
|
|
0
|
return if $self->ignore_sequence; |
313
|
|
|
|
|
|
|
|
314
|
0
|
|
|
|
|
0
|
my $seqfactory = Bio::Seq::SeqFactory->new('Bio::Seq'); |
315
|
0
|
|
|
|
|
0
|
my $seq = $seqfactory->create(-seq=>$res, |
316
|
|
|
|
|
|
|
-id=>$seqid, |
317
|
|
|
|
|
|
|
-desc=>$desc); |
318
|
0
|
|
|
|
|
0
|
$seq->accession_number($seqid); |
319
|
0
|
0
|
|
|
|
0
|
if ($self->features_attached_to_seqs) { |
320
|
|
|
|
|
|
|
my @feats = |
321
|
0
|
|
|
|
|
0
|
@{$self->_feature_idx_by_seq_id->{$seqid}}; |
|
0
|
|
|
|
|
0
|
|
322
|
0
|
|
|
|
|
0
|
$seq->add_SeqFeature($_) foreach @feats; |
323
|
0
|
|
|
|
|
0
|
@{$self->_feature_idx_by_seq_id->{$seqid}} = (); |
|
0
|
|
|
|
|
0
|
|
324
|
|
|
|
|
|
|
} |
325
|
0
|
|
|
|
|
0
|
return $seq; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
else { |
328
|
0
|
|
|
|
|
0
|
$self->throw("expected fasta header, not: $line"); |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
} |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
=head2 next_segment |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
Title : next_segment |
336
|
|
|
|
|
|
|
Usage : my $seq = $gffio->next_segment; |
337
|
|
|
|
|
|
|
Function: Returns a Bio::LocatableSeq object corresponding to a |
338
|
|
|
|
|
|
|
GFF "##sequence-region" header line. |
339
|
|
|
|
|
|
|
Example : |
340
|
|
|
|
|
|
|
Returns : A Bio::LocatableSeq object, or undef if |
341
|
|
|
|
|
|
|
there are no more sequences. |
342
|
|
|
|
|
|
|
Args : none |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=cut |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
sub next_segment{ |
348
|
1
|
|
|
1
|
1
|
3
|
my ($self,@args) = @_; |
349
|
1
|
50
|
|
|
|
4
|
return shift @{ $self->{'segments'} } if defined $self->{'segments'}; |
|
1
|
|
|
|
|
2
|
|
350
|
0
|
|
|
|
|
0
|
return; |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head2 next_feature |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Title : next_feature |
357
|
|
|
|
|
|
|
Usage : $seqfeature = $gffio->next_feature(); |
358
|
|
|
|
|
|
|
Function: Returns the next feature available in the input file or stream, or |
359
|
|
|
|
|
|
|
undef if there are no more features. |
360
|
|
|
|
|
|
|
Example : |
361
|
|
|
|
|
|
|
Returns : A Bio::SeqFeatureI implementing object, or undef if there are no |
362
|
|
|
|
|
|
|
more features. |
363
|
|
|
|
|
|
|
Args : none |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=cut |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub next_feature { |
368
|
4
|
|
|
4
|
1
|
183
|
my ($self) = @_; |
369
|
|
|
|
|
|
|
|
370
|
4
|
|
|
|
|
4
|
my $gff_string; |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
# be graceful about empty lines or comments, and make sure we return undef |
373
|
|
|
|
|
|
|
# if the input's consumed |
374
|
4
|
|
66
|
|
|
29
|
while(($gff_string = $self->_readline()) && defined($gff_string)) { |
375
|
3
|
50
|
|
|
|
9
|
if ($gff_string =~ /^\#\#\#/) { |
376
|
|
|
|
|
|
|
# all forward refs have been seen; TODO |
377
|
|
|
|
|
|
|
} |
378
|
3
|
50
|
33
|
|
|
22
|
next if($gff_string =~ /^\#/ || $gff_string =~ /^\s*$/ || |
|
|
|
33
|
|
|
|
|
379
|
|
|
|
|
|
|
$gff_string =~ m{^//}); |
380
|
|
|
|
|
|
|
|
381
|
3
|
|
|
|
|
8
|
while ($gff_string =~ /^\>(.+)/) { |
382
|
|
|
|
|
|
|
# fasta can be in header or footer |
383
|
0
|
|
|
|
|
0
|
my $seq = $self->_parse_sequence($gff_string); |
384
|
0
|
0
|
|
|
|
0
|
if ($seq) { |
385
|
0
|
|
|
|
|
0
|
$self->_seq_by_id_h->{$seq->primary_id} = $seq; |
386
|
0
|
|
|
|
|
0
|
$gff_string = $self->_readline; |
387
|
0
|
0
|
|
|
|
0
|
last unless $gff_string; |
388
|
|
|
|
|
|
|
} |
389
|
|
|
|
|
|
|
} |
390
|
3
|
|
|
|
|
3
|
last; |
391
|
|
|
|
|
|
|
} |
392
|
4
|
100
|
|
|
|
12
|
return unless $gff_string; |
393
|
|
|
|
|
|
|
|
394
|
3
|
|
|
|
|
9
|
my $feat = Bio::SeqFeature::Generic->new(); |
395
|
3
|
|
|
|
|
8
|
$self->from_gff_string($feat, $gff_string); |
396
|
|
|
|
|
|
|
|
397
|
3
|
50
|
|
|
|
9
|
if ($self->features_attached_to_seqs) { |
398
|
0
|
|
|
|
|
0
|
push(@{$self->_feature_idx_by_seq_id->{$feat->seq_id}}, |
|
0
|
|
|
|
|
0
|
|
399
|
|
|
|
|
|
|
$feat); |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
|
402
|
3
|
|
|
|
|
7
|
return $feat; |
403
|
|
|
|
|
|
|
} |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
sub _feature_idx_by_seq_id { |
406
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
407
|
0
|
0
|
|
|
|
0
|
$self->{__feature_idx_by_seq_id} = shift if @_; |
408
|
|
|
|
|
|
|
$self->{__feature_idx_by_seq_id} = {} |
409
|
0
|
0
|
|
|
|
0
|
unless $self->{__feature_idx_by_seq_id}; |
410
|
0
|
|
|
|
|
0
|
return $self->{__feature_idx_by_seq_id}; |
411
|
|
|
|
|
|
|
} |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=head2 from_gff_string |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Title : from_gff_string |
417
|
|
|
|
|
|
|
Usage : $gff->from_gff_string($feature, $gff_string); |
418
|
|
|
|
|
|
|
Function: Sets properties of a SeqFeatureI object from a GFF-formatted |
419
|
|
|
|
|
|
|
string. Interpretation of the string depends on the version |
420
|
|
|
|
|
|
|
that has been specified at initialization. |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
This method is used by next_feature(). It actually dispatches to |
423
|
|
|
|
|
|
|
one of the version-specific (private) methods. |
424
|
|
|
|
|
|
|
Example : |
425
|
|
|
|
|
|
|
Returns : void |
426
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be initialized |
427
|
|
|
|
|
|
|
The GFF-formatted string to initialize it from |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=cut |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
sub from_gff_string { |
432
|
3
|
|
|
3
|
1
|
4
|
my ($self, $feat, $gff_string) = @_; |
433
|
|
|
|
|
|
|
|
434
|
3
|
100
|
|
|
|
5
|
if($self->gff_version() == 1) { |
|
|
100
|
|
|
|
|
|
435
|
1
|
|
|
|
|
2
|
return $self->_from_gff1_string($feat, $gff_string); |
436
|
|
|
|
|
|
|
} elsif( $self->gff_version() == 3 ) { |
437
|
1
|
|
|
|
|
5
|
return $self->_from_gff3_string($feat, $gff_string); |
438
|
|
|
|
|
|
|
} else { |
439
|
1
|
|
|
|
|
4
|
return $self->_from_gff2_string($feat, $gff_string); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=head2 _from_gff1_string |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
Title : _from_gff1_string |
447
|
|
|
|
|
|
|
Usage : |
448
|
|
|
|
|
|
|
Function: |
449
|
|
|
|
|
|
|
Example : |
450
|
|
|
|
|
|
|
Returns : void |
451
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be initialized |
452
|
|
|
|
|
|
|
The GFF-formatted string to initialize it from |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
=cut |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
sub _from_gff1_string { |
457
|
1
|
|
|
1
|
|
2
|
my ($gff, $feat, $string) = @_; |
458
|
1
|
|
|
|
|
2
|
chomp $string; |
459
|
1
|
|
|
|
|
5
|
my ($seqname, $source, $primary, $start, $end, $score, |
460
|
|
|
|
|
|
|
$strand, $frame, @group) = split(/\t/, $string); |
461
|
|
|
|
|
|
|
|
462
|
1
|
50
|
|
|
|
3
|
if ( !defined $frame ) { |
463
|
0
|
|
|
|
|
0
|
$feat->throw("[$string] does not look like GFF to me"); |
464
|
|
|
|
|
|
|
} |
465
|
1
|
50
|
|
|
|
6
|
$frame = 0 unless( $frame =~ /^\d+$/); |
466
|
1
|
|
|
|
|
3
|
$feat->seq_id($seqname); |
467
|
1
|
|
|
|
|
3
|
$feat->source_tag($source); |
468
|
1
|
|
|
|
|
2
|
$feat->primary_tag($primary); |
469
|
1
|
|
|
|
|
2
|
$feat->start($start); |
470
|
1
|
|
|
|
|
2
|
$feat->end($end); |
471
|
1
|
|
|
|
|
2
|
$feat->frame($frame); |
472
|
1
|
50
|
|
|
|
2
|
if ( $score eq '.' ) { |
473
|
|
|
|
|
|
|
#$feat->score(undef); |
474
|
|
|
|
|
|
|
} else { |
475
|
1
|
|
|
|
|
2
|
$feat->score($score); |
476
|
|
|
|
|
|
|
} |
477
|
1
|
50
|
|
|
|
3
|
if ( $strand eq '-' ) { $feat->strand(-1); } |
|
1
|
|
|
|
|
2
|
|
478
|
1
|
50
|
|
|
|
3
|
if ( $strand eq '+' ) { $feat->strand(1); } |
|
0
|
|
|
|
|
0
|
|
479
|
1
|
50
|
|
|
|
3
|
if ( $strand eq '.' ) { $feat->strand(0); } |
|
0
|
|
|
|
|
0
|
|
480
|
1
|
|
|
|
|
3
|
foreach my $g ( @group ) { |
481
|
0
|
0
|
|
|
|
0
|
if ( $g =~ /(\S+)=(\S+)/ ) { |
482
|
0
|
|
|
|
|
0
|
my $tag = $1; |
483
|
0
|
|
|
|
|
0
|
my $value = $2; |
484
|
0
|
|
|
|
|
0
|
$feat->add_tag_value($1, $2); |
485
|
|
|
|
|
|
|
} else { |
486
|
0
|
|
|
|
|
0
|
$feat->add_tag_value('group', $g); |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
} |
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 _from_gff2_string |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
Title : _from_gff2_string |
495
|
|
|
|
|
|
|
Usage : |
496
|
|
|
|
|
|
|
Function: |
497
|
|
|
|
|
|
|
Example : |
498
|
|
|
|
|
|
|
Returns : void |
499
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be initialized |
500
|
|
|
|
|
|
|
The GFF2-formatted string to initialize it from |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=cut |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
sub _from_gff2_string { |
506
|
1
|
|
|
1
|
|
2
|
my ($gff, $feat, $string) = @_; |
507
|
1
|
|
|
|
|
2
|
chomp($string); |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
# according to the Sanger website, GFF2 should be single-tab |
510
|
|
|
|
|
|
|
# separated elements, and the free-text at the end should contain |
511
|
|
|
|
|
|
|
# text-translated tab symbols but no "real" tabs, so splitting on |
512
|
|
|
|
|
|
|
# \t is safe, and $attribs gets the entire attributes field to be |
513
|
|
|
|
|
|
|
# parsed later |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
# sendu: but the tag value pair can (should?) be separated by a tab. The |
516
|
|
|
|
|
|
|
# 'no tabs' thing seems to apply only to the free text that is allowed for |
517
|
|
|
|
|
|
|
# the value |
518
|
|
|
|
|
|
|
|
519
|
1
|
|
|
|
|
8
|
my ($seqname, $source, $primary, $start, |
520
|
|
|
|
|
|
|
$end, $score, $strand, $frame, @attribs) = split(/\t+/, $string); |
521
|
1
|
|
|
|
|
3
|
my $attribs = join ' ', @attribs; |
522
|
|
|
|
|
|
|
|
523
|
1
|
50
|
|
|
|
3
|
if ( !defined $frame ) { |
524
|
0
|
|
|
|
|
0
|
$feat->throw("[$string] does not look like GFF2 to me"); |
525
|
|
|
|
|
|
|
} |
526
|
1
|
|
|
|
|
3
|
$feat->seq_id($seqname); |
527
|
1
|
|
|
|
|
2
|
$feat->source_tag($source); |
528
|
1
|
|
|
|
|
3
|
$feat->primary_tag($primary); |
529
|
1
|
|
|
|
|
2
|
$feat->start($start); |
530
|
1
|
|
|
|
|
3
|
$feat->end($end); |
531
|
1
|
|
|
|
|
2
|
$feat->frame($frame); |
532
|
1
|
50
|
|
|
|
3
|
if ( $score eq '.' ) { |
533
|
|
|
|
|
|
|
# $feat->score(undef); |
534
|
|
|
|
|
|
|
} else { |
535
|
1
|
|
|
|
|
3
|
$feat->score($score); |
536
|
|
|
|
|
|
|
} |
537
|
1
|
50
|
|
|
|
4
|
if ( $strand eq '-' ) { $feat->strand(-1); } |
|
1
|
|
|
|
|
2
|
|
538
|
1
|
50
|
|
|
|
4
|
if ( $strand eq '+' ) { $feat->strand(1); } |
|
0
|
|
|
|
|
0
|
|
539
|
1
|
50
|
|
|
|
3
|
if ( $strand eq '.' ) { $feat->strand(0); } |
|
0
|
|
|
|
|
0
|
|
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
# |
543
|
|
|
|
|
|
|
# this routine is necessay to allow the presence of semicolons in |
544
|
|
|
|
|
|
|
# quoted text Semicolons are the delimiting character for new |
545
|
|
|
|
|
|
|
# tag/value attributes. it is more or less a "state" machine, with |
546
|
|
|
|
|
|
|
# the "quoted" flag going up and down as we pass thorugh quotes to |
547
|
|
|
|
|
|
|
# distinguish free-text semicolon and hash symbols from GFF control |
548
|
|
|
|
|
|
|
# characters |
549
|
|
|
|
|
|
|
|
550
|
1
|
|
|
|
|
1
|
my $flag = 0; # this could be changed to a bit and just be twiddled |
551
|
1
|
|
|
|
|
1
|
my @parsed; |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# run through each character one at a time and check it |
554
|
|
|
|
|
|
|
# NOTE: changed to foreach loop which is more efficient in perl |
555
|
|
|
|
|
|
|
# --jasons |
556
|
1
|
|
|
|
|
9
|
for my $a ( split //, $attribs ) { |
557
|
|
|
|
|
|
|
# flag up on entering quoted text, down on leaving it |
558
|
61
|
100
|
100
|
|
|
168
|
if( $a eq '"') { $flag = ( $flag == 0 ) ? 1:0 } |
|
2
|
100
|
33
|
|
|
4
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
559
|
1
|
|
|
|
|
1
|
elsif( $a eq ';' && $flag ) { $a = "INSERT_SEMICOLON_HERE"} |
560
|
0
|
|
|
|
|
0
|
elsif( $a eq '#' && ! $flag ) { last } |
561
|
61
|
|
|
|
|
48
|
push @parsed, $a; |
562
|
|
|
|
|
|
|
} |
563
|
1
|
|
|
|
|
6
|
$attribs = join "", @parsed; # rejoin into a single string |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# |
566
|
|
|
|
|
|
|
# Please feel free to fix this and make it more "perlish" |
567
|
|
|
|
|
|
|
|
568
|
1
|
|
|
|
|
3
|
my @key_vals = split /;/, $attribs; # attributes are semicolon-delimited |
569
|
|
|
|
|
|
|
|
570
|
1
|
|
|
|
|
2
|
foreach my $pair ( @key_vals ) { |
571
|
|
|
|
|
|
|
# replace semicolons that were removed from free-text above. |
572
|
3
|
|
|
|
|
5
|
$pair =~ s/INSERT_SEMICOLON_HERE/;/g; |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# separate the key from the value |
575
|
3
|
|
|
|
|
12
|
my ($blank, $key, $values) = split /^\s*([\w\d]+)\s/, $pair; |
576
|
|
|
|
|
|
|
|
577
|
3
|
50
|
|
|
|
5
|
if( defined $values ) { |
578
|
3
|
|
|
|
|
2
|
my @values; |
579
|
|
|
|
|
|
|
# free text is quoted, so match each free-text block |
580
|
|
|
|
|
|
|
# and remove it from the $values string |
581
|
3
|
|
|
|
|
9
|
while ($values =~ s/"(.*?)"//){ |
582
|
|
|
|
|
|
|
# and push it on to the list of values (tags may have |
583
|
|
|
|
|
|
|
# more than one value... and the value may be undef) |
584
|
1
|
|
|
|
|
3
|
push @values, $1; |
585
|
|
|
|
|
|
|
} |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# and what is left over should be space-separated |
588
|
|
|
|
|
|
|
# non-free-text values |
589
|
|
|
|
|
|
|
|
590
|
3
|
|
|
|
|
7
|
my @othervals = split /\s+/, $values; |
591
|
3
|
|
|
|
|
4
|
foreach my $othervalue(@othervals){ |
592
|
|
|
|
|
|
|
# get rid of any empty strings which might |
593
|
|
|
|
|
|
|
# result from the split |
594
|
2
|
50
|
|
|
|
3
|
if (CORE::length($othervalue) > 0) {push @values, $othervalue} |
|
2
|
|
|
|
|
3
|
|
595
|
|
|
|
|
|
|
} |
596
|
|
|
|
|
|
|
|
597
|
3
|
|
|
|
|
3
|
foreach my $value(@values){ |
598
|
3
|
|
|
|
|
6
|
$feat->add_tag_value($key, $value); |
599
|
|
|
|
|
|
|
} |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
sub _from_gff3_string { |
606
|
1
|
|
|
1
|
|
3
|
my ($gff, $feat, $string) = @_; |
607
|
1
|
|
|
|
|
3
|
chomp($string); |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# according to the now nearly final GFF3 spec, columns should |
610
|
|
|
|
|
|
|
# be tab separated, allowing unescaped spaces to occur in |
611
|
|
|
|
|
|
|
# column 9 |
612
|
|
|
|
|
|
|
|
613
|
1
|
|
|
|
|
5
|
my ($seqname, $source, $primary, $start, $end, |
614
|
|
|
|
|
|
|
$score, $strand, $frame, $groups) = split(/\t/, $string); |
615
|
|
|
|
|
|
|
|
616
|
1
|
50
|
|
|
|
4
|
if ( ! defined $frame ) { |
617
|
0
|
|
|
|
|
0
|
$feat->throw("[$string] does not look like GFF3 to me"); |
618
|
|
|
|
|
|
|
} |
619
|
1
|
|
|
|
|
3
|
$feat->seq_id($seqname); |
620
|
1
|
|
|
|
|
2
|
$feat->source_tag($source); |
621
|
1
|
|
|
|
|
3
|
$feat->primary_tag($primary); |
622
|
1
|
|
|
|
|
2
|
$feat->start($start); |
623
|
1
|
|
|
|
|
2
|
$feat->end($end); |
624
|
1
|
|
|
|
|
6
|
$feat->frame($frame); |
625
|
1
|
50
|
|
|
|
3
|
if ( $score eq '.' ) { |
626
|
|
|
|
|
|
|
#$feat->score(undef); |
627
|
|
|
|
|
|
|
} else { |
628
|
0
|
|
|
|
|
0
|
$feat->score($score); |
629
|
|
|
|
|
|
|
} |
630
|
1
|
50
|
|
|
|
4
|
if ( $strand eq '-' ) { $feat->strand(-1); } |
|
0
|
|
|
|
|
0
|
|
631
|
1
|
50
|
|
|
|
2
|
if ( $strand eq '+' ) { $feat->strand(1); } |
|
1
|
|
|
|
|
2
|
|
632
|
1
|
50
|
|
|
|
4
|
if ( $strand eq '.' ) { $feat->strand(0); } |
|
0
|
|
|
|
|
0
|
|
633
|
1
|
|
|
|
|
9
|
my @groups = split(/\s*;\s*/, $groups); |
634
|
|
|
|
|
|
|
|
635
|
1
|
|
|
|
|
2
|
for my $group (@groups) { |
636
|
3
|
|
|
|
|
7
|
my ($tag,$value) = split /=/,$group; |
637
|
3
|
|
|
|
|
5
|
$tag = unescape($tag); |
638
|
3
|
|
|
|
|
5
|
my @values = map {unescape($_)} split /,/,$value; |
|
3
|
|
|
|
|
4
|
|
639
|
3
|
|
|
|
|
4
|
for my $v ( @values ) { $feat->add_tag_value($tag,$v); } |
|
3
|
|
|
|
|
5
|
|
640
|
|
|
|
|
|
|
} |
641
|
|
|
|
|
|
|
} |
642
|
|
|
|
|
|
|
|
643
|
|
|
|
|
|
|
# taken from Bio::DB::GFF |
644
|
|
|
|
|
|
|
sub unescape { |
645
|
48
|
|
|
48
|
0
|
40
|
my $v = shift; |
646
|
48
|
|
|
|
|
39
|
$v =~ tr/+/ /; |
647
|
48
|
|
|
|
|
52
|
$v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; |
|
2
|
|
|
|
|
7
|
|
648
|
48
|
|
|
|
|
157
|
return $v; |
649
|
|
|
|
|
|
|
} |
650
|
|
|
|
|
|
|
|
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
=head2 write_feature |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
Title : write_feature |
655
|
|
|
|
|
|
|
Usage : $gffio->write_feature($feature); |
656
|
|
|
|
|
|
|
Function: Writes the specified SeqFeatureI object in GFF format to the stream |
657
|
|
|
|
|
|
|
associated with this instance. |
658
|
|
|
|
|
|
|
Returns : none |
659
|
|
|
|
|
|
|
Args : An array of Bio::SeqFeatureI implementing objects to be serialized |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
=cut |
662
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
sub write_feature { |
664
|
3
|
|
|
3
|
1
|
186
|
my ($self, @features) = @_; |
665
|
3
|
50
|
|
|
|
7
|
return unless @features; |
666
|
3
|
100
|
66
|
|
|
11
|
if( $self->{'_first'} && $self->gff_version() == 3 ) { |
667
|
1
|
|
|
|
|
4
|
$self->_print("##gff-version 3\n"); |
668
|
|
|
|
|
|
|
} |
669
|
3
|
|
|
|
|
5
|
$self->{'_first'} = 0; |
670
|
3
|
|
|
|
|
4
|
foreach my $feature ( @features ) { |
671
|
3
|
|
|
|
|
8
|
$self->_print($self->gff_string($feature)."\n"); |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
=head2 gff_string |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
Title : gff_string |
679
|
|
|
|
|
|
|
Usage : $gffstr = $gffio->gff_string($feature); |
680
|
|
|
|
|
|
|
Function: Obtain the GFF-formatted representation of a SeqFeatureI object. |
681
|
|
|
|
|
|
|
The formatting depends on the version specified at initialization. |
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
This method is used by write_feature(). It actually dispatches to |
684
|
|
|
|
|
|
|
one of the version-specific (private) methods. |
685
|
|
|
|
|
|
|
Example : |
686
|
|
|
|
|
|
|
Returns : A GFF-formatted string representation of the SeqFeature |
687
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be GFF-stringified |
688
|
|
|
|
|
|
|
|
689
|
|
|
|
|
|
|
=cut |
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
sub gff_string{ |
692
|
11
|
|
|
11
|
1
|
11
|
my ($self, $feature) = @_; |
693
|
|
|
|
|
|
|
|
694
|
11
|
100
|
|
|
|
22
|
if($self->gff_version() == 1) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
695
|
1
|
|
|
|
|
3
|
return $self->_gff1_string($feature); |
696
|
|
|
|
|
|
|
} elsif( $self->gff_version() == 3 ) { |
697
|
8
|
|
|
|
|
19
|
return $self->_gff3_string($feature); |
698
|
|
|
|
|
|
|
} elsif( $self->gff_version() == 2.5 ) { |
699
|
0
|
|
|
|
|
0
|
return $self->_gff25_string($feature); |
700
|
|
|
|
|
|
|
} else { |
701
|
2
|
|
|
|
|
9
|
return $self->_gff2_string($feature); |
702
|
|
|
|
|
|
|
} |
703
|
|
|
|
|
|
|
} |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
|
706
|
|
|
|
|
|
|
=head2 _gff1_string |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
Title : _gff1_string |
709
|
|
|
|
|
|
|
Usage : $gffstr = $gffio->_gff1_string |
710
|
|
|
|
|
|
|
Function: |
711
|
|
|
|
|
|
|
Example : |
712
|
|
|
|
|
|
|
Returns : A GFF1-formatted string representation of the SeqFeature |
713
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be GFF-stringified |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
=cut |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
sub _gff1_string{ |
718
|
1
|
|
|
1
|
|
2
|
my ($gff, $feat) = @_; |
719
|
1
|
|
|
|
|
1
|
my ($str,$score,$frame,$name,$strand); |
720
|
|
|
|
|
|
|
|
721
|
1
|
50
|
|
|
|
12
|
if( $feat->can('score') ) { |
722
|
1
|
|
|
|
|
3
|
$score = $feat->score(); |
723
|
|
|
|
|
|
|
} |
724
|
1
|
50
|
|
|
|
3
|
$score = '.' unless defined $score; |
725
|
|
|
|
|
|
|
|
726
|
1
|
50
|
|
|
|
4
|
if( $feat->can('frame') ) { |
727
|
1
|
|
|
|
|
3
|
$frame = $feat->frame(); |
728
|
|
|
|
|
|
|
} |
729
|
1
|
50
|
|
|
|
5
|
$frame = '.' unless defined $frame; |
730
|
|
|
|
|
|
|
|
731
|
1
|
|
|
|
|
2
|
$strand = $feat->strand(); |
732
|
1
|
50
|
|
|
|
6
|
if(! $strand) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
733
|
0
|
|
|
|
|
0
|
$strand = "."; |
734
|
|
|
|
|
|
|
} elsif( $strand == 1 ) { |
735
|
0
|
|
|
|
|
0
|
$strand = '+'; |
736
|
|
|
|
|
|
|
} elsif ( $feat->strand == -1 ) { |
737
|
1
|
|
|
|
|
1
|
$strand = '-'; |
738
|
|
|
|
|
|
|
} |
739
|
|
|
|
|
|
|
|
740
|
1
|
50
|
|
|
|
5
|
if( $feat->can('seqname') ) { |
741
|
1
|
|
|
|
|
4
|
$name = $feat->seq_id(); |
742
|
1
|
|
50
|
|
|
4
|
$name ||= 'SEQ'; |
743
|
|
|
|
|
|
|
} else { |
744
|
0
|
|
|
|
|
0
|
$name = 'SEQ'; |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
|
747
|
1
|
|
|
|
|
4
|
$str = join("\t", |
748
|
|
|
|
|
|
|
$name, |
749
|
|
|
|
|
|
|
$feat->source_tag, |
750
|
|
|
|
|
|
|
$feat->primary_tag, |
751
|
|
|
|
|
|
|
$feat->start, |
752
|
|
|
|
|
|
|
$feat->end, |
753
|
|
|
|
|
|
|
$score, |
754
|
|
|
|
|
|
|
$strand, |
755
|
|
|
|
|
|
|
$frame); |
756
|
|
|
|
|
|
|
|
757
|
1
|
|
|
|
|
4
|
foreach my $tag ( $feat->get_all_tags ) { |
758
|
4
|
100
|
|
|
|
7
|
next if exists $SKIPPED_TAGS{$tag}; |
759
|
3
|
|
|
|
|
4
|
foreach my $value ( $feat->get_tag_values($tag) ) { |
760
|
3
|
50
|
|
|
|
12
|
$str .= " $tag=$value" if $value; |
761
|
|
|
|
|
|
|
} |
762
|
|
|
|
|
|
|
} |
763
|
|
|
|
|
|
|
|
764
|
1
|
|
|
|
|
10
|
return $str; |
765
|
|
|
|
|
|
|
} |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
=head2 _gff2_string |
769
|
|
|
|
|
|
|
|
770
|
|
|
|
|
|
|
Title : _gff2_string |
771
|
|
|
|
|
|
|
Usage : $gffstr = $gffio->_gff2_string |
772
|
|
|
|
|
|
|
Function: |
773
|
|
|
|
|
|
|
Example : |
774
|
|
|
|
|
|
|
Returns : A GFF2-formatted string representation of the SeqFeature |
775
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be GFF2-stringified |
776
|
|
|
|
|
|
|
|
777
|
|
|
|
|
|
|
=cut |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
sub _gff2_string{ |
780
|
2
|
|
|
2
|
|
3
|
my ($gff, $origfeat) = @_; |
781
|
2
|
|
|
|
|
3
|
my $feat; |
782
|
2
|
50
|
|
|
|
17
|
if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){ |
783
|
0
|
|
|
|
|
0
|
$feat = $origfeat->feature2; |
784
|
|
|
|
|
|
|
} else { |
785
|
2
|
|
|
|
|
3
|
$feat = $origfeat; |
786
|
|
|
|
|
|
|
} |
787
|
2
|
|
|
|
|
4
|
my ($str1, $str2,$score,$frame,$name,$strand); |
788
|
|
|
|
|
|
|
|
789
|
2
|
50
|
|
|
|
108
|
if( $feat->can('score') ) { |
790
|
2
|
|
|
|
|
7
|
$score = $feat->score(); |
791
|
|
|
|
|
|
|
} |
792
|
2
|
100
|
|
|
|
6
|
$score = '.' unless defined $score; |
793
|
|
|
|
|
|
|
|
794
|
2
|
50
|
|
|
|
8
|
if( $feat->can('frame') ) { |
795
|
2
|
|
|
|
|
4
|
$frame = $feat->frame(); |
796
|
|
|
|
|
|
|
} |
797
|
2
|
50
|
|
|
|
6
|
$frame = '.' unless defined $frame; |
798
|
|
|
|
|
|
|
|
799
|
2
|
|
|
|
|
3
|
$strand = $feat->strand(); |
800
|
2
|
50
|
|
|
|
18
|
if(! $strand) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
801
|
0
|
|
|
|
|
0
|
$strand = "."; |
802
|
|
|
|
|
|
|
} elsif( $strand == 1 ) { |
803
|
1
|
|
|
|
|
2
|
$strand = '+'; |
804
|
|
|
|
|
|
|
} elsif ( $feat->strand == -1 ) { |
805
|
1
|
|
|
|
|
2
|
$strand = '-'; |
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
|
808
|
2
|
50
|
|
|
|
9
|
if( $feat->can('seqname') ) { |
809
|
2
|
|
|
|
|
5
|
$name = $feat->seq_id(); |
810
|
|
|
|
|
|
|
} |
811
|
2
|
|
50
|
|
|
8
|
$name ||= 'SEQ'; |
812
|
|
|
|
|
|
|
|
813
|
2
|
|
|
|
|
5
|
$str1 = join("\t", |
814
|
|
|
|
|
|
|
$name, |
815
|
|
|
|
|
|
|
$feat->source_tag(), |
816
|
|
|
|
|
|
|
$feat->primary_tag(), |
817
|
|
|
|
|
|
|
$feat->start(), |
818
|
|
|
|
|
|
|
$feat->end(), |
819
|
|
|
|
|
|
|
$score, |
820
|
|
|
|
|
|
|
$strand, |
821
|
|
|
|
|
|
|
$frame); |
822
|
|
|
|
|
|
|
# the routine below is the only modification I made to the original |
823
|
|
|
|
|
|
|
# ->gff_string routine (above) as on November 17th, 2000, the |
824
|
|
|
|
|
|
|
# Sanger webpage describing GFF2 format reads: "From version 2 |
825
|
|
|
|
|
|
|
# onwards, the attribute field must have a tag value structure |
826
|
|
|
|
|
|
|
# following the syntax used within objects in a .ace file, |
827
|
|
|
|
|
|
|
# flattened onto one line by semicolon separators. Tags must be |
828
|
|
|
|
|
|
|
# standard identifiers ([A-Za-z][A-Za-z0-9_]*). Free text values |
829
|
|
|
|
|
|
|
# must be quoted with double quotes". |
830
|
|
|
|
|
|
|
# MW |
831
|
|
|
|
|
|
|
|
832
|
2
|
|
|
|
|
3
|
my @group; |
833
|
|
|
|
|
|
|
|
834
|
2
|
|
|
|
|
5
|
foreach my $tag ( $feat->get_all_tags ) { |
835
|
7
|
100
|
|
|
|
14
|
next if exists $SKIPPED_TAGS{$tag}; |
836
|
6
|
|
|
|
|
5
|
my @v; |
837
|
6
|
|
|
|
|
10
|
foreach my $value ( $feat->get_tag_values($tag) ) { |
838
|
6
|
50
|
33
|
|
|
29
|
unless( defined $value && length($value) ) { |
|
|
100
|
|
|
|
|
|
839
|
|
|
|
|
|
|
# quote anything other than valid tag/value characters |
840
|
0
|
|
|
|
|
0
|
$value = '""'; |
841
|
|
|
|
|
|
|
} elsif ($value =~ /[^A-Za-z0-9_]/){ |
842
|
|
|
|
|
|
|
# substitute tab and newline chars by their UNIX equivalent |
843
|
1
|
|
|
|
|
2
|
$value =~ s/\t/\\t/g; |
844
|
1
|
|
|
|
|
1
|
$value =~ s/\n/\\n/g; |
845
|
1
|
|
|
|
|
3
|
$value = '"' . $value . '" '; |
846
|
|
|
|
|
|
|
} |
847
|
6
|
|
|
|
|
8
|
push @v, $value; |
848
|
|
|
|
|
|
|
# for this tag (allowed in GFF2 and .ace format) |
849
|
|
|
|
|
|
|
} |
850
|
6
|
|
|
|
|
14
|
push @group, "$tag ".join(" ", @v); |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
|
853
|
2
|
|
|
|
|
6
|
$str2 .= join(' ; ', @group); |
854
|
|
|
|
|
|
|
# Add Target information for Feature Pairs |
855
|
2
|
50
|
33
|
|
|
4
|
if( ! $feat->has_tag('Target') && # This is a bad hack IMHO |
|
|
|
33
|
|
|
|
|
856
|
|
|
|
|
|
|
! $feat->has_tag('Group') && |
857
|
|
|
|
|
|
|
$origfeat->isa('Bio::SeqFeature::FeaturePair') ) { |
858
|
0
|
0
|
|
|
|
0
|
$str2 = sprintf("Target %s %d %d", $origfeat->feature1->seq_id, |
|
|
0
|
|
|
|
|
|
859
|
|
|
|
|
|
|
( $origfeat->feature1->strand < 0 ? |
860
|
|
|
|
|
|
|
( $origfeat->feature1->end, |
861
|
|
|
|
|
|
|
$origfeat->feature1->start) : |
862
|
|
|
|
|
|
|
( $origfeat->feature1->start, |
863
|
|
|
|
|
|
|
$origfeat->feature1->end) |
864
|
|
|
|
|
|
|
)) . ($str2?" ; ".$str2:""); # need to put Target information before other tag/value pairs - mw |
865
|
|
|
|
|
|
|
} |
866
|
2
|
|
|
|
|
12
|
return $str1."\t".$str2; |
867
|
|
|
|
|
|
|
} |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
=head2 _gff25_string |
871
|
|
|
|
|
|
|
|
872
|
|
|
|
|
|
|
Title : _gff25_string |
873
|
|
|
|
|
|
|
Usage : $gffstr = $gffio->_gff2_string |
874
|
|
|
|
|
|
|
Function: To get a format of GFF that is peculiar to Gbrowse/Bio::DB::GFF |
875
|
|
|
|
|
|
|
Example : |
876
|
|
|
|
|
|
|
Returns : A GFF2.5-formatted string representation of the SeqFeature |
877
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be GFF2.5-stringified |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
=cut |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
sub _gff25_string { |
882
|
0
|
|
|
0
|
|
0
|
my ($gff, $origfeat) = @_; |
883
|
0
|
|
|
|
|
0
|
my $feat; |
884
|
0
|
0
|
|
|
|
0
|
if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){ |
885
|
0
|
|
|
|
|
0
|
$feat = $origfeat->feature2; |
886
|
|
|
|
|
|
|
} else { |
887
|
0
|
|
|
|
|
0
|
$feat = $origfeat; |
888
|
|
|
|
|
|
|
} |
889
|
0
|
|
|
|
|
0
|
my ($str1, $str2,$score,$frame,$name,$strand); |
890
|
|
|
|
|
|
|
|
891
|
0
|
0
|
|
|
|
0
|
if( $feat->can('score') ) { |
892
|
0
|
|
|
|
|
0
|
$score = $feat->score(); |
893
|
|
|
|
|
|
|
} |
894
|
0
|
0
|
|
|
|
0
|
$score = '.' unless defined $score; |
895
|
|
|
|
|
|
|
|
896
|
0
|
0
|
|
|
|
0
|
if( $feat->can('frame') ) { |
897
|
0
|
|
|
|
|
0
|
$frame = $feat->frame(); |
898
|
|
|
|
|
|
|
} |
899
|
0
|
0
|
|
|
|
0
|
$frame = '.' unless defined $frame; |
900
|
|
|
|
|
|
|
|
901
|
0
|
|
|
|
|
0
|
$strand = $feat->strand(); |
902
|
0
|
0
|
|
|
|
0
|
if(! $strand) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
903
|
0
|
|
|
|
|
0
|
$strand = "."; |
904
|
|
|
|
|
|
|
} elsif( $strand == 1 ) { |
905
|
0
|
|
|
|
|
0
|
$strand = '+'; |
906
|
|
|
|
|
|
|
} elsif ( $feat->strand == -1 ) { |
907
|
0
|
|
|
|
|
0
|
$strand = '-'; |
908
|
|
|
|
|
|
|
} |
909
|
|
|
|
|
|
|
|
910
|
0
|
0
|
|
|
|
0
|
if( $feat->can('seqname') ) { |
911
|
0
|
|
|
|
|
0
|
$name = $feat->seq_id(); |
912
|
0
|
|
0
|
|
|
0
|
$name ||= 'SEQ'; |
913
|
|
|
|
|
|
|
} else { |
914
|
0
|
|
|
|
|
0
|
$name = 'SEQ'; |
915
|
|
|
|
|
|
|
} |
916
|
0
|
|
|
|
|
0
|
$str1 = join("\t", |
917
|
|
|
|
|
|
|
$name, |
918
|
|
|
|
|
|
|
$feat->source_tag(), |
919
|
|
|
|
|
|
|
$feat->primary_tag(), |
920
|
|
|
|
|
|
|
$feat->start(), |
921
|
|
|
|
|
|
|
$feat->end(), |
922
|
|
|
|
|
|
|
$score, |
923
|
|
|
|
|
|
|
$strand, |
924
|
|
|
|
|
|
|
$frame); |
925
|
|
|
|
|
|
|
|
926
|
0
|
|
|
|
|
0
|
my @all_tags = $feat->all_tags; |
927
|
0
|
|
|
|
|
0
|
my @group; my @firstgroup; |
928
|
0
|
0
|
|
|
|
0
|
if (@all_tags) { # only play this game if it is worth playing... |
929
|
0
|
|
|
|
|
0
|
foreach my $tag ( @all_tags ) { |
930
|
0
|
|
|
|
|
0
|
my @v; |
931
|
0
|
|
|
|
|
0
|
foreach my $value ( $feat->get_tag_values($tag) ) { |
932
|
0
|
0
|
|
|
|
0
|
next if exists $SKIPPED_TAGS{$tag}; |
933
|
0
|
0
|
0
|
|
|
0
|
unless( defined $value && length($value) ) { |
|
|
0
|
|
|
|
|
|
934
|
0
|
|
|
|
|
0
|
$value = '""'; |
935
|
|
|
|
|
|
|
} elsif ($value =~ /[^A-Za-z0-9_]/){ |
936
|
0
|
|
|
|
|
0
|
$value =~ s/\t/\\t/g; # substitute tab and newline |
937
|
|
|
|
|
|
|
# characters |
938
|
0
|
|
|
|
|
0
|
$value =~ s/\n/\\n/g; # to their UNIX equivalents |
939
|
0
|
|
|
|
|
0
|
$value = '"' . $value . '" '; |
940
|
|
|
|
|
|
|
} # if the value contains |
941
|
|
|
|
|
|
|
# anything other than valid |
942
|
|
|
|
|
|
|
# tag/value characters, then |
943
|
|
|
|
|
|
|
# quote it |
944
|
0
|
|
|
|
|
0
|
push @v, $value; |
945
|
|
|
|
|
|
|
# for this tag (allowed in GFF2 and .ace format) |
946
|
|
|
|
|
|
|
} |
947
|
0
|
0
|
0
|
|
|
0
|
if (($tag eq 'Group') || ($tag eq 'Target')){ # hopefully we wont get both... |
948
|
0
|
|
|
|
|
0
|
push @firstgroup, "$tag ".join(" ", @v); |
949
|
|
|
|
|
|
|
} else { |
950
|
0
|
|
|
|
|
0
|
push @group, "$tag ".join(" ", @v); |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
} |
953
|
|
|
|
|
|
|
} |
954
|
0
|
|
|
|
|
0
|
$str2 = join(' ; ', (@firstgroup, @group)); |
955
|
|
|
|
|
|
|
# Add Target information for Feature Pairs |
956
|
0
|
0
|
0
|
|
|
0
|
if( ! $feat->has_tag('Target') && # This is a bad hack IMHO |
|
|
|
0
|
|
|
|
|
957
|
|
|
|
|
|
|
! $feat->has_tag('Group') && |
958
|
|
|
|
|
|
|
$origfeat->isa('Bio::SeqFeature::FeaturePair') ) { |
959
|
0
|
0
|
|
|
|
0
|
$str2 = sprintf("Target %s ; tstart %d ; tend %d", $origfeat->feature1->seq_id, |
|
|
0
|
|
|
|
|
|
960
|
|
|
|
|
|
|
( $origfeat->feature1->strand < 0 ? |
961
|
|
|
|
|
|
|
( $origfeat->feature1->end, |
962
|
|
|
|
|
|
|
$origfeat->feature1->start) : |
963
|
|
|
|
|
|
|
( $origfeat->feature1->start, |
964
|
|
|
|
|
|
|
$origfeat->feature1->end) |
965
|
|
|
|
|
|
|
)) . ($str2?" ; ".$str2:""); # need to put the target info before other tag/value pairs - mw |
966
|
|
|
|
|
|
|
} |
967
|
0
|
|
|
|
|
0
|
return $str1 . "\t". $str2; |
968
|
|
|
|
|
|
|
} |
969
|
|
|
|
|
|
|
|
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
=head2 _gff3_string |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
Title : _gff3_string |
974
|
|
|
|
|
|
|
Usage : $gffstr = $gffio->_gff3_string |
975
|
|
|
|
|
|
|
Function: |
976
|
|
|
|
|
|
|
Example : |
977
|
|
|
|
|
|
|
Returns : A GFF3-formatted string representation of the SeqFeature |
978
|
|
|
|
|
|
|
Args : A Bio::SeqFeatureI implementing object to be GFF3-stringified |
979
|
|
|
|
|
|
|
|
980
|
|
|
|
|
|
|
=cut |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
sub _gff3_string { |
983
|
8
|
|
|
8
|
|
8
|
my ($gff, $origfeat) = @_; |
984
|
8
|
|
|
|
|
6
|
my $feat; |
985
|
8
|
50
|
|
|
|
39
|
if ($origfeat->isa('Bio::SeqFeature::FeaturePair')){ |
986
|
0
|
|
|
|
|
0
|
$feat = $origfeat->feature2; |
987
|
|
|
|
|
|
|
} else { |
988
|
8
|
|
|
|
|
8
|
$feat = $origfeat; |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
|
991
|
8
|
|
|
|
|
15
|
my $ID = $gff->_incrementGFF3ID(); |
992
|
|
|
|
|
|
|
|
993
|
8
|
|
|
|
|
8
|
my ($score,$frame,$name,$strand); |
994
|
|
|
|
|
|
|
|
995
|
8
|
50
|
|
|
|
30
|
if( $feat->can('score') ) { |
996
|
8
|
|
|
|
|
13
|
$score = $feat->score(); |
997
|
|
|
|
|
|
|
} |
998
|
8
|
100
|
|
|
|
15
|
$score = '.' unless defined $score; |
999
|
|
|
|
|
|
|
|
1000
|
8
|
50
|
|
|
|
22
|
if( $feat->can('frame') ) { |
1001
|
8
|
|
|
|
|
15
|
$frame = $feat->frame(); |
1002
|
|
|
|
|
|
|
} |
1003
|
8
|
100
|
|
|
|
13
|
$frame = '1' unless defined $frame; |
1004
|
|
|
|
|
|
|
|
1005
|
8
|
|
|
|
|
16
|
$strand = $feat->strand(); |
1006
|
|
|
|
|
|
|
|
1007
|
8
|
50
|
|
|
|
20
|
if(! $strand) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1008
|
0
|
|
|
|
|
0
|
$strand = "."; |
1009
|
|
|
|
|
|
|
} elsif( $strand == 1 ) { |
1010
|
8
|
|
|
|
|
10
|
$strand = '+'; |
1011
|
|
|
|
|
|
|
} elsif ( $feat->strand == -1 ) { |
1012
|
0
|
|
|
|
|
0
|
$strand = '-'; |
1013
|
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
|
|
1015
|
8
|
50
|
|
|
|
24
|
if( $feat->can('seqname') ) { |
1016
|
8
|
|
|
|
|
18
|
$name = $feat->seq_id(); |
1017
|
8
|
|
50
|
|
|
17
|
$name ||= 'SEQ'; |
1018
|
|
|
|
|
|
|
} else { |
1019
|
0
|
|
|
|
|
0
|
$name = 'SEQ'; |
1020
|
|
|
|
|
|
|
} |
1021
|
|
|
|
|
|
|
|
1022
|
8
|
|
|
|
|
8
|
my @groups; |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
# force leading ID and Parent tags |
1025
|
8
|
|
|
|
|
17
|
my @all_tags = grep { ! exists $GFF3_ID_Tags{$_} } $feat->all_tags; |
|
36
|
|
|
|
|
57
|
|
1026
|
8
|
|
|
|
|
30
|
for my $t ( sort { $GFF3_ID_Tags{$b} <=> $GFF3_ID_Tags{$a} } |
|
23
|
|
|
|
|
37
|
|
1027
|
|
|
|
|
|
|
keys %GFF3_ID_Tags ) { |
1028
|
24
|
100
|
|
|
|
40
|
unshift @all_tags, $t if $feat->has_tag($t); |
1029
|
|
|
|
|
|
|
} |
1030
|
|
|
|
|
|
|
|
1031
|
8
|
|
|
|
|
13
|
for my $tag ( @all_tags ) { |
1032
|
36
|
100
|
|
|
|
63
|
next if exists $SKIPPED_TAGS{$tag}; |
1033
|
|
|
|
|
|
|
# next if $tag eq 'Target'; |
1034
|
29
|
100
|
66
|
|
|
84
|
if ($tag eq 'Target' && ! $origfeat->isa('Bio::SeqFeature::FeaturePair')){ |
1035
|
|
|
|
|
|
|
# simple Target,start,stop |
1036
|
7
|
|
|
|
|
17
|
my($target_id, $b,$e,$strand) = $feat->get_tag_values($tag); |
1037
|
7
|
0
|
33
|
|
|
24
|
next unless(defined($e) && defined($b) && $target_id); |
|
|
|
33
|
|
|
|
|
1038
|
0
|
0
|
0
|
|
|
0
|
($b,$e)= ($e,$b) if(defined $strand && $strand<0); |
1039
|
0
|
|
|
|
|
0
|
$target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge; |
|
0
|
|
|
|
|
0
|
|
1040
|
0
|
|
|
|
|
0
|
push @groups, sprintf("Target=%s %d %d", $target_id,$b,$e); |
1041
|
0
|
|
|
|
|
0
|
next; |
1042
|
|
|
|
|
|
|
} |
1043
|
|
|
|
|
|
|
|
1044
|
22
|
|
|
|
|
14
|
my $valuestr; |
1045
|
|
|
|
|
|
|
# a string which will hold one or more values |
1046
|
|
|
|
|
|
|
# for this tag, with quoted free text and |
1047
|
|
|
|
|
|
|
# space-separated individual values. |
1048
|
|
|
|
|
|
|
my @v; |
1049
|
22
|
|
|
|
|
39
|
for my $value ( $feat->get_tag_values($tag) ) { |
1050
|
22
|
100
|
66
|
|
|
67
|
if( defined $value && length($value) ) { |
1051
|
|
|
|
|
|
|
#$value =~ tr/ /+/; #spaces are allowed now |
1052
|
20
|
50
|
|
|
|
32
|
if ( ref $value eq 'Bio::Annotation::Comment') { |
1053
|
0
|
|
|
|
|
0
|
$value = $value->text; |
1054
|
|
|
|
|
|
|
} |
1055
|
|
|
|
|
|
|
|
1056
|
20
|
100
|
|
|
|
46
|
if ($value =~ /[^a-zA-Z0-9\,\;\=\.:\%\^\*\$\@\!\+\_\?\-]/) { |
1057
|
1
|
|
|
|
|
3
|
$value =~ s/\t/\\t/g; # substitute tab and newline |
1058
|
|
|
|
|
|
|
# characters |
1059
|
1
|
|
|
|
|
1
|
$value =~ s/\n/\\n/g; # to their UNIX equivalents |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
# Unescaped quotes are not allowed in GFF3 |
1062
|
|
|
|
|
|
|
# $value = '"' . $value . '"'; |
1063
|
|
|
|
|
|
|
} |
1064
|
20
|
|
|
|
|
35
|
$value =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge; |
|
2
|
|
|
|
|
15
|
|
1065
|
|
|
|
|
|
|
} else { |
1066
|
|
|
|
|
|
|
# if it is completely empty, then just make empty double quotes |
1067
|
2
|
|
|
|
|
4
|
$value = '""'; |
1068
|
|
|
|
|
|
|
} |
1069
|
22
|
|
|
|
|
27
|
push @v, $value; |
1070
|
|
|
|
|
|
|
} |
1071
|
|
|
|
|
|
|
# can we figure out how to improve this? |
1072
|
22
|
100
|
|
|
|
77
|
$tag = lcfirst($tag) unless ( $tag =~ |
1073
|
|
|
|
|
|
|
/^(ID|Name|Alias|Parent|Gap|Target|Derives_from|Note|Dbxref|Ontology_term)$/); |
1074
|
|
|
|
|
|
|
|
1075
|
22
|
|
|
|
|
60
|
push @groups, "$tag=".join(",",@v); |
1076
|
|
|
|
|
|
|
} |
1077
|
|
|
|
|
|
|
# Add Target information for Feature Pairs |
1078
|
8
|
50
|
66
|
|
|
15
|
if( $feat->has_tag('Target') && |
|
|
|
66
|
|
|
|
|
1079
|
|
|
|
|
|
|
! $feat->has_tag('Group') && |
1080
|
|
|
|
|
|
|
$origfeat->isa('Bio::SeqFeature::FeaturePair') ) { |
1081
|
|
|
|
|
|
|
|
1082
|
0
|
|
|
|
|
0
|
my $target_id = $origfeat->feature1->seq_id; |
1083
|
0
|
|
|
|
|
0
|
$target_id =~ s/([\t\n\r%&\=;,])/sprintf("%%%X",ord($1))/ge; |
|
0
|
|
|
|
|
0
|
|
1084
|
|
|
|
|
|
|
|
1085
|
0
|
0
|
|
|
|
0
|
push @groups, sprintf("Target=%s %d %d", |
1086
|
|
|
|
|
|
|
$target_id, |
1087
|
|
|
|
|
|
|
( $origfeat->feature1->strand < 0 ? |
1088
|
|
|
|
|
|
|
( $origfeat->feature1->end, |
1089
|
|
|
|
|
|
|
$origfeat->feature1->start) : |
1090
|
|
|
|
|
|
|
( $origfeat->feature1->start, |
1091
|
|
|
|
|
|
|
$origfeat->feature1->end) |
1092
|
|
|
|
|
|
|
)); |
1093
|
|
|
|
|
|
|
} |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
# unshift @groups, "ID=autogenerated$ID" unless ($feat->has_tag('ID')); |
1096
|
8
|
50
|
33
|
|
|
53
|
if ( $feat->can('name') && defined($feat->name) ) { |
1097
|
|
|
|
|
|
|
# such as might be for Bio::DB::SeqFeature |
1098
|
0
|
|
|
|
|
0
|
unshift @groups, 'Name=' . $feat->name; |
1099
|
|
|
|
|
|
|
} |
1100
|
|
|
|
|
|
|
|
1101
|
8
|
|
|
|
|
11
|
my $gff_string = ""; |
1102
|
8
|
50
|
|
|
|
16
|
if ($feat->location->isa("Bio::Location::SplitLocationI")) { |
1103
|
0
|
|
|
|
|
0
|
my @locs = $feat->location->each_Location; |
1104
|
0
|
|
|
|
|
0
|
foreach my $loc (@locs) { |
1105
|
0
|
|
0
|
|
|
0
|
$gff_string .= join("\t", |
1106
|
|
|
|
|
|
|
$name, |
1107
|
|
|
|
|
|
|
$feat->source_tag() || '.', |
1108
|
|
|
|
|
|
|
$feat->primary_tag(), |
1109
|
|
|
|
|
|
|
$loc->start(), |
1110
|
|
|
|
|
|
|
$loc->end(), |
1111
|
|
|
|
|
|
|
$score, |
1112
|
|
|
|
|
|
|
$strand, |
1113
|
|
|
|
|
|
|
$frame, |
1114
|
|
|
|
|
|
|
join(';', @groups)) . "\n"; |
1115
|
|
|
|
|
|
|
} |
1116
|
0
|
|
|
|
|
0
|
chop $gff_string; |
1117
|
0
|
|
|
|
|
0
|
return $gff_string; |
1118
|
|
|
|
|
|
|
} else { |
1119
|
8
|
|
50
|
|
|
15
|
$gff_string = join("\t", |
1120
|
|
|
|
|
|
|
$name, |
1121
|
|
|
|
|
|
|
$feat->source_tag() || '.', |
1122
|
|
|
|
|
|
|
$feat->primary_tag(), |
1123
|
|
|
|
|
|
|
$feat->start(), |
1124
|
|
|
|
|
|
|
$feat->end(), |
1125
|
|
|
|
|
|
|
$score, |
1126
|
|
|
|
|
|
|
$strand, |
1127
|
|
|
|
|
|
|
$frame, |
1128
|
|
|
|
|
|
|
join(';', @groups)); |
1129
|
|
|
|
|
|
|
} |
1130
|
8
|
|
|
|
|
70
|
return $gff_string; |
1131
|
|
|
|
|
|
|
} |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
=head2 gff_version |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
Title : _gff_version |
1137
|
|
|
|
|
|
|
Usage : $gffversion = $gffio->gff_version |
1138
|
|
|
|
|
|
|
Function: |
1139
|
|
|
|
|
|
|
Example : |
1140
|
|
|
|
|
|
|
Returns : The GFF version this parser will accept and emit. |
1141
|
|
|
|
|
|
|
Args : none |
1142
|
|
|
|
|
|
|
|
1143
|
|
|
|
|
|
|
=cut |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
sub gff_version { |
1146
|
47
|
|
|
47
|
1
|
40
|
my ($self, $value) = @_; |
1147
|
47
|
100
|
66
|
|
|
84
|
if(defined $value && grep {$value == $_ } ( 1, 2, 2.5, 3)) { |
|
64
|
|
|
|
|
117
|
|
1148
|
16
|
|
|
|
|
23
|
$self->{'GFF_VERSION'} = $value; |
1149
|
|
|
|
|
|
|
} |
1150
|
47
|
|
|
|
|
383
|
return $self->{'GFF_VERSION'}; |
1151
|
|
|
|
|
|
|
} |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
# Make filehandles |
1155
|
|
|
|
|
|
|
|
1156
|
|
|
|
|
|
|
=head2 newFh |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
Title : newFh |
1159
|
|
|
|
|
|
|
Usage : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') |
1160
|
|
|
|
|
|
|
Function: does a new() followed by an fh() |
1161
|
|
|
|
|
|
|
Example : $fh = Bio::Tools::GFF->newFh(-file=>$filename,-format=>'Format') |
1162
|
|
|
|
|
|
|
$feature = <$fh>; # read a feature object |
1163
|
|
|
|
|
|
|
print $fh $feature; # write a feature object |
1164
|
|
|
|
|
|
|
Returns : filehandle tied to the Bio::Tools::GFF class |
1165
|
|
|
|
|
|
|
Args : |
1166
|
|
|
|
|
|
|
|
1167
|
|
|
|
|
|
|
=cut |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
sub newFh { |
1170
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
1171
|
0
|
0
|
|
|
|
0
|
return unless my $self = $class->new(@_); |
1172
|
0
|
|
|
|
|
0
|
return $self->fh; |
1173
|
|
|
|
|
|
|
} |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
=head2 fh |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
Title : fh |
1179
|
|
|
|
|
|
|
Usage : $obj->fh |
1180
|
|
|
|
|
|
|
Function: |
1181
|
|
|
|
|
|
|
Example : $fh = $obj->fh; # make a tied filehandle |
1182
|
|
|
|
|
|
|
$feature = <$fh>; # read a feature object |
1183
|
|
|
|
|
|
|
print $fh $feature; # write a feature object |
1184
|
|
|
|
|
|
|
Returns : filehandle tied to Bio::Tools::GFF class |
1185
|
|
|
|
|
|
|
Args : none |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
=cut |
1188
|
|
|
|
|
|
|
|
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
sub fh { |
1191
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1192
|
0
|
|
0
|
|
|
0
|
my $class = ref($self) || $self; |
1193
|
0
|
|
|
|
|
0
|
my $s = Symbol::gensym; |
1194
|
0
|
|
|
|
|
0
|
tie $$s,$class,$self; |
1195
|
0
|
|
|
|
|
0
|
return $s; |
1196
|
|
|
|
|
|
|
} |
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
# This accessor is used for accessing the Bio::Seq objects from a GFF3 |
1199
|
|
|
|
|
|
|
# file; if the file you are using has no sequence data you can ignore |
1200
|
|
|
|
|
|
|
# this accessor |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
# This accessor returns a hash reference containing Bio::Seq objects, |
1203
|
|
|
|
|
|
|
# indexed by Bio::Seq->primary_id |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
sub _seq_by_id_h { |
1206
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
1207
|
|
|
|
|
|
|
|
1208
|
0
|
0
|
|
|
|
0
|
return $self->{'_seq_by_id_h'} = shift if @_; |
1209
|
|
|
|
|
|
|
$self->{'_seq_by_id_h'} = {} |
1210
|
0
|
0
|
|
|
|
0
|
unless $self->{'_seq_by_id_h'}; |
1211
|
0
|
|
|
|
|
0
|
return $self->{'_seq_by_id_h'}; |
1212
|
|
|
|
|
|
|
} |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
=head2 get_seqs |
1216
|
|
|
|
|
|
|
|
1217
|
|
|
|
|
|
|
Title : get_seqs |
1218
|
|
|
|
|
|
|
Usage : |
1219
|
|
|
|
|
|
|
Function: Returns all Bio::Seq objects populated by GFF3 file |
1220
|
|
|
|
|
|
|
Example : |
1221
|
|
|
|
|
|
|
Returns : |
1222
|
|
|
|
|
|
|
Args : |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=cut |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
sub get_seqs { |
1227
|
0
|
|
|
0
|
1
|
0
|
my ($self,@args) = @_; |
1228
|
0
|
|
|
|
|
0
|
return values %{$self->_seq_by_id_h}; |
|
0
|
|
|
|
|
0
|
|
1229
|
|
|
|
|
|
|
} |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
=head2 features_attached_to_seqs |
1233
|
|
|
|
|
|
|
|
1234
|
|
|
|
|
|
|
Title : features_attached_to_seqs |
1235
|
|
|
|
|
|
|
Usage : $obj->features_attached_to_seqs(1); |
1236
|
|
|
|
|
|
|
Function: For use with GFF3 containg sequence only |
1237
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
Setting this B parsing ensures that all Bio::Seq object |
1239
|
|
|
|
|
|
|
created will have the appropriate features added to them |
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
defaults to false (off) |
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
Note that this mode will incur higher memory usage because features |
1244
|
|
|
|
|
|
|
will have to be cached until the relevant feature comes along |
1245
|
|
|
|
|
|
|
|
1246
|
|
|
|
|
|
|
Example : |
1247
|
|
|
|
|
|
|
Returns : value of features_attached_to_seqs (a boolean) |
1248
|
|
|
|
|
|
|
Args : on set, new value (a boolean, optional) |
1249
|
|
|
|
|
|
|
|
1250
|
|
|
|
|
|
|
|
1251
|
|
|
|
|
|
|
=cut |
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
sub features_attached_to_seqs{ |
1254
|
3
|
|
|
3
|
1
|
3
|
my $self = shift; |
1255
|
|
|
|
|
|
|
|
1256
|
3
|
50
|
|
|
|
7
|
return $self->{'_features_attached_to_seqs'} = shift if @_; |
1257
|
3
|
|
|
|
|
5
|
return $self->{'_features_attached_to_seqs'}; |
1258
|
|
|
|
|
|
|
} |
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
=head2 ignore_sequence |
1262
|
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
Title : ignore_sequence |
1264
|
|
|
|
|
|
|
Usage : $obj->ignore_sequence(1); |
1265
|
|
|
|
|
|
|
Function: For use with GFF3 containg sequence only |
1266
|
|
|
|
|
|
|
|
1267
|
|
|
|
|
|
|
Setting this B parsing means that all sequence data will be |
1268
|
|
|
|
|
|
|
ignored |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
Example : |
1271
|
|
|
|
|
|
|
Returns : value of ignore_sequence (a boolean) |
1272
|
|
|
|
|
|
|
Args : on set, new value (a boolean, optional) |
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
=cut |
1275
|
|
|
|
|
|
|
|
1276
|
|
|
|
|
|
|
sub ignore_sequence{ |
1277
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1278
|
|
|
|
|
|
|
|
1279
|
0
|
0
|
|
|
|
0
|
return $self->{'_ignore_sequence'} = shift if @_; |
1280
|
0
|
|
|
|
|
0
|
return $self->{'_ignore_sequence'}; |
1281
|
|
|
|
|
|
|
} |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
sub DESTROY { |
1285
|
15
|
|
|
15
|
|
57
|
my $self = shift; |
1286
|
15
|
|
|
|
|
41
|
$self->close(); |
1287
|
|
|
|
|
|
|
} |
1288
|
|
|
|
|
|
|
|
1289
|
|
|
|
|
|
|
sub TIEHANDLE { |
1290
|
0
|
|
|
0
|
|
|
my ($class,$val) = @_; |
1291
|
0
|
|
|
|
|
|
return bless {'gffio' => $val}, $class; |
1292
|
|
|
|
|
|
|
} |
1293
|
|
|
|
|
|
|
|
1294
|
|
|
|
|
|
|
sub READLINE { |
1295
|
0
|
|
|
0
|
|
|
my $self = shift; |
1296
|
0
|
0
|
0
|
|
|
|
return $self->{'gffio'}->next_feature() || undef unless wantarray; |
1297
|
0
|
|
|
|
|
|
my (@list, $obj); |
1298
|
0
|
|
|
|
|
|
push @list, $obj while $obj = $self->{'gffio'}->next_feature(); |
1299
|
0
|
|
|
|
|
|
return @list; |
1300
|
|
|
|
|
|
|
} |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
sub PRINT { |
1303
|
0
|
|
|
0
|
|
|
my $self = shift; |
1304
|
0
|
|
|
|
|
|
$self->{'gffio'}->write_feature(@_); |
1305
|
|
|
|
|
|
|
} |
1306
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
1; |
1308
|
|
|
|
|
|
|
|