line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::ToolBox::parser::gff; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '1.66'; |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
=head1 NAME |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
Bio::ToolBox::parser::gff - parse GFF3, GTF, and GFF files |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 SYNOPSIS |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
use Bio::ToolBox::parser::gff; |
12
|
|
|
|
|
|
|
my $filename = 'file.gff3'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
my $parser = Bio::ToolBox::parser::gff->new( |
15
|
|
|
|
|
|
|
file => $filename, |
16
|
|
|
|
|
|
|
do_gene => 1, |
17
|
|
|
|
|
|
|
do_exon => 1, |
18
|
|
|
|
|
|
|
) or die "unable to open gff file!\n"; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
while (my $feature = $parser->next_top_feature() ) { |
21
|
|
|
|
|
|
|
# each $feature is a SeqFeature object |
22
|
|
|
|
|
|
|
my @children = $feature->get_SeqFeatures(); |
23
|
|
|
|
|
|
|
} |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 DESCRIPTION |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This module parses a GFF file into SeqFeature objects. It natively |
28
|
|
|
|
|
|
|
handles GFF3, GTF, and general GFF files. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
For both GFF3 and GTF files, fully nested gene models, typically |
31
|
|
|
|
|
|
|
gene =E transcript =E (exon, CDS, etc), may be built using the appropriate |
32
|
|
|
|
|
|
|
attribute tags. For GFF3 files, these include ID and Parent tags; for GTF |
33
|
|
|
|
|
|
|
these include C and C tags. |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
For GFF3 files, any feature without a C tag is assumed to be a |
36
|
|
|
|
|
|
|
parent. Children features referencing a parent feature that has not been |
37
|
|
|
|
|
|
|
loaded are considered orphans. Orphans are attempted to be re-associated |
38
|
|
|
|
|
|
|
with missing parents after the file is completely parsed. Any orphans left |
39
|
|
|
|
|
|
|
may be collected. Files with orphans are considered poorly formatted or |
40
|
|
|
|
|
|
|
incomplete and should be fixed. Multiple parentage, for example exons |
41
|
|
|
|
|
|
|
shared between different transcripts of the same gene, are fully supported. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Embedded Fasta sequences are ignored, as are most comment and pragma lines. |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
The SeqFeature objects that are returned are L |
46
|
|
|
|
|
|
|
objects. Refer to that documentation for more information. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=head1 METHODS |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 Initialize and modify the parser. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
These are class methods to initialize the parser with an annotation file |
53
|
|
|
|
|
|
|
and modify the parsing behavior. Most parameters can be set either upon |
54
|
|
|
|
|
|
|
initialization or as class methods on the object. Unpredictable behavior |
55
|
|
|
|
|
|
|
may occur if you implement these in the midst of parsing a file. |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Do not open subsequent files with the same object. Always create a new |
58
|
|
|
|
|
|
|
object to parse a new file |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=over 4 |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=item new |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
my $parser = Bio::ToolBox::parser::gff->new($filename); |
65
|
|
|
|
|
|
|
my $parser = Bio::ToolBox::parser::gff->new( |
66
|
|
|
|
|
|
|
file => 'file.gtf.gz', |
67
|
|
|
|
|
|
|
do_gene => 1, |
68
|
|
|
|
|
|
|
do_utr => 1, |
69
|
|
|
|
|
|
|
); |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Initialize a new gff parser object. Pass a single value (a GFF file name) |
72
|
|
|
|
|
|
|
to open a file. Alternatively, pass an array of key value pairs to control |
73
|
|
|
|
|
|
|
how the file is parsed. Options include the following. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=over 4 |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=item file |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
Provide a GFF file name to be parsed. It should have a gff, gtf, or gff3 file |
80
|
|
|
|
|
|
|
extension. The file may be gzip compressed. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
=item version |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
Specify the version. Normally this is not needed, as version can be determined |
85
|
|
|
|
|
|
|
either from the file extension (in the case of gtf and gff3) or from the |
86
|
|
|
|
|
|
|
C<##gff-version> pragma at the top of the file. Acceptable values include 1, 2, |
87
|
|
|
|
|
|
|
2.5 (gtf), or 3. |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
=item class |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
Pass the name of a L compliant class that will be used to |
92
|
|
|
|
|
|
|
create the SeqFeature objects. The default is to use L. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=item simplify |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
Pass a boolean value to simplify the SeqFeature objects parsed from the GFF |
97
|
|
|
|
|
|
|
file and ignore extraneous attributes. |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
=item do_gene |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Pass a boolean (1 or 0) value to combine multiple transcripts with the same gene |
102
|
|
|
|
|
|
|
name under a single gene object. Default is true. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=item do_cds |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=item do_exon |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=item do_utr |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=item do_codon |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
Pass a boolean (1 or 0) value to parse certain subfeatures. Exon subfeatures |
113
|
|
|
|
|
|
|
are always parsed, but C, C, C, C, |
114
|
|
|
|
|
|
|
and C features may be optionally parsed. Default is false. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=back |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
=item open_file |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
$parser->open_file($file) or die "unable to open $file!"; |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Pass the name of a GFF file to be parsed. The file may optionally be gzipped |
123
|
|
|
|
|
|
|
(F<.gz> extension). Do not open a new file when one has already opened a file. |
124
|
|
|
|
|
|
|
Create a new object for a new file, or concatenate the GFF files. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item version |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
Set or get the GFF version of the current file. Acceptable values include 1, 2, |
129
|
|
|
|
|
|
|
2.5 (gtf), or 3. Normally this is determined by file extension or |
130
|
|
|
|
|
|
|
C pragma on the first line, and should not need to be set by the |
131
|
|
|
|
|
|
|
user in most circumstances. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=item simplify |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Pass a boolean true value to simplify the attributes of GFF3 and GTF files |
136
|
|
|
|
|
|
|
that may have considerable numbers of tags, e.g. Ensembl files. Only |
137
|
|
|
|
|
|
|
essential information, including name, ID, and parentage, is retained. |
138
|
|
|
|
|
|
|
Useful if you're trying to quickly parse annotation files for basic |
139
|
|
|
|
|
|
|
information. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=back |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 Feature retrieval |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
The following methods parse the GFF file lines into SeqFeature objects. |
146
|
|
|
|
|
|
|
It is best if these methods are not mixed; unexpected results may occur. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=over 4 |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item next_top_feature |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
This method will return a top level parent SeqFeature object |
153
|
|
|
|
|
|
|
assembled with child features as sub-features. For example, a gene |
154
|
|
|
|
|
|
|
object with mRNA subfeatures, which in turn may have exon and/or CDS |
155
|
|
|
|
|
|
|
subfeatures. Child features are assembled based on the existence of |
156
|
|
|
|
|
|
|
proper Parent attributes in child features. If no Parent attributes are |
157
|
|
|
|
|
|
|
included in the GFF file, then this will behave as L. |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
Child features (those containing a C attribute) |
160
|
|
|
|
|
|
|
are associated with the parent feature. A warning will be issued about lost |
161
|
|
|
|
|
|
|
children (orphans). Shared subfeatures, for example exons common to |
162
|
|
|
|
|
|
|
multiple transcripts, are associated properly with each parent. An opportunity |
163
|
|
|
|
|
|
|
to rescue orphans is available using the L method. |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
Note that subfeatures may not necessarily be in ascending genomic order |
166
|
|
|
|
|
|
|
when associated with the feature, depending on their order in the GFF3 |
167
|
|
|
|
|
|
|
file and whether shared subfeatures are present or not. When calling |
168
|
|
|
|
|
|
|
subfeatures in your program, you may want to sort the subfeatures. For |
169
|
|
|
|
|
|
|
example |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
my @subfeatures = map { $_->[0] } |
172
|
|
|
|
|
|
|
sort { $a->[1] <=> $b->[1] } |
173
|
|
|
|
|
|
|
map { [$_, $_->start] } |
174
|
|
|
|
|
|
|
$parent->get_SeqFeatures; |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=item top_features |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
This method will return an array of the top (parent) features defined in |
179
|
|
|
|
|
|
|
the GFF file. This is similar to the L method except that |
180
|
|
|
|
|
|
|
all features are returned at once. |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=item next_feature |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
This method will return a SeqFeature object representation of |
185
|
|
|
|
|
|
|
the next feature (line) in the file. Parent - child relationships are |
186
|
|
|
|
|
|
|
NOT assembled; however, undefined parents in a GTF file may still be |
187
|
|
|
|
|
|
|
generated, just not returned. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
This method is best used with simple GFF files with no hierarchies |
190
|
|
|
|
|
|
|
present. This may be used in a while loop until the end of the file |
191
|
|
|
|
|
|
|
is reached. Pragmas are ignored and comment lines and sequence are |
192
|
|
|
|
|
|
|
automatically skipped. |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=back |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head2 Other methods |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
Additional methods for working with the parser object and the parsed |
199
|
|
|
|
|
|
|
SeqFeature objects. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
=over 4 |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=item fh |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
This method returns the L object of the opened GFF file. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=item parse_file |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
Parses the entire file into memory. This is automatically called when |
210
|
|
|
|
|
|
|
either L or L is called. |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item find_gene |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
my $gene = $parser->find_gene( |
215
|
|
|
|
|
|
|
name => $display_name, |
216
|
|
|
|
|
|
|
id => $primary_id, |
217
|
|
|
|
|
|
|
) or warn "gene $display_name can not be found!"; |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Pass a gene name, or an array of key =E values (C, C, |
220
|
|
|
|
|
|
|
C, C, and/or coordinate information), that can be used |
221
|
|
|
|
|
|
|
to find a gene already loaded into memory. Only useful after |
222
|
|
|
|
|
|
|
is called. Genes with a matching name are confirmed by a matching ID or |
223
|
|
|
|
|
|
|
overlapping coordinates, if available. Otherwise the first match is returned. |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item orphans |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
my @orphans = $parser->orphans; |
228
|
|
|
|
|
|
|
printf "we have %d orphans left over!", scalar @orpans; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
This method will return an array of orphan SeqFeature objects that indicated |
231
|
|
|
|
|
|
|
they had a parent but said parent could not be found. Typically, this is an |
232
|
|
|
|
|
|
|
indication of an incomplete or malformed GFF3 file. Nevertheless, it might |
233
|
|
|
|
|
|
|
be a good idea to check this after retrieving all top features. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
=item comments |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
This method will return an array of the comment or pragma lines that may have |
238
|
|
|
|
|
|
|
been in the parsed file. These may or may not be useful. |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=item typelist |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
Returns a comma-delimited string of the GFF primary tags (column 3) |
243
|
|
|
|
|
|
|
observed in the first 1000 lines of an opened file. Useful for |
244
|
|
|
|
|
|
|
checking what is in the GFF file. See |
245
|
|
|
|
|
|
|
L. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=item from_gff_string |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
my $seqfeature = $parser->from_gff_string($string); |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
This method will parse a single GFF, GTF, or GFF3 formatted string or line |
252
|
|
|
|
|
|
|
of text and return a SeqFeature object. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=item unescape |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
This method will unescape special characters in a text string. Certain |
257
|
|
|
|
|
|
|
characters, including ";" and "=", are reserved for GFF3 formatting and |
258
|
|
|
|
|
|
|
are not allowed, thus requiring them to be escaped. |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=item seq_ids |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
Returns an array or array reference of the names of the chromosomes or |
263
|
|
|
|
|
|
|
reference sequences present in the file. These may be defined by GFF3 |
264
|
|
|
|
|
|
|
sequence-region pragmas or inferred from the features. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=item seq_id_lengths |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
my $seq2len = $parser->seq_id_lengths; |
269
|
|
|
|
|
|
|
foreach (keys %$seq2len) { |
270
|
|
|
|
|
|
|
printf "chromosome %s is %d bp long\n", $_, $seq2len->{$_}; |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Returns a hash reference to the chromosomes or reference sequences and |
274
|
|
|
|
|
|
|
their corresponding lengths. In this case, the length is either defined |
275
|
|
|
|
|
|
|
by the C pragma or inferred by the greatest end position of |
276
|
|
|
|
|
|
|
the top features. |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
=back |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
=head1 SEE ALSO |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
L, L, L |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
=cut |
285
|
|
|
|
|
|
|
|
286
|
1
|
|
|
1
|
|
72886
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
26
|
|
287
|
1
|
|
|
1
|
|
5
|
use Carp qw(carp cluck croak); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
39
|
|
288
|
1
|
|
|
1
|
|
503
|
use Bio::ToolBox::Data::file; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
4565
|
|
289
|
|
|
|
|
|
|
my $SFCLASS = 'Bio::ToolBox::SeqFeature'; # alternative to Bio::SeqFeature::Lite |
290
|
|
|
|
|
|
|
eval "require $SFCLASS" or croak $@; |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
1; |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
sub new { |
295
|
11
|
|
|
11
|
1
|
10767
|
my $class = shift; |
296
|
11
|
|
|
|
|
129
|
my $self = { |
297
|
|
|
|
|
|
|
'fh' => undef, |
298
|
|
|
|
|
|
|
'top_features' => [], |
299
|
|
|
|
|
|
|
'orphans' => [], |
300
|
|
|
|
|
|
|
'duplicate_ids' => {}, |
301
|
|
|
|
|
|
|
'loaded' => {}, |
302
|
|
|
|
|
|
|
'eof' => 0, |
303
|
|
|
|
|
|
|
'do_gene' => 1, |
304
|
|
|
|
|
|
|
'do_exon' => 0, |
305
|
|
|
|
|
|
|
'do_cds' => 0, |
306
|
|
|
|
|
|
|
'do_utr' => 0, |
307
|
|
|
|
|
|
|
'do_codon' => 0, |
308
|
|
|
|
|
|
|
'gff3' => 0, |
309
|
|
|
|
|
|
|
'gtf' => 0, |
310
|
|
|
|
|
|
|
'comments' => [], |
311
|
|
|
|
|
|
|
'seq_ids' => {}, |
312
|
|
|
|
|
|
|
'simplify' => 0, |
313
|
|
|
|
|
|
|
'typelist' => '', |
314
|
|
|
|
|
|
|
'convertor_sub' => undef, |
315
|
|
|
|
|
|
|
}; |
316
|
11
|
|
|
|
|
22
|
bless $self, $class; |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# check for options |
319
|
11
|
100
|
|
|
|
29
|
if (@_) { |
320
|
8
|
50
|
|
|
|
17
|
if (scalar @_ == 1) { |
321
|
0
|
0
|
|
|
|
0
|
$self->open_file($_[0]) or croak "unable to open file!"; |
322
|
|
|
|
|
|
|
} |
323
|
|
|
|
|
|
|
else { |
324
|
8
|
|
|
|
|
25
|
my %options = @_; |
325
|
8
|
100
|
|
|
|
20
|
if (exists $options{simplify}) { |
326
|
4
|
|
|
|
|
9
|
$self->simplify( $options{simplify} ); |
327
|
|
|
|
|
|
|
} |
328
|
8
|
100
|
|
|
|
16
|
if (exists $options{do_gene}) { |
329
|
4
|
|
|
|
|
10
|
$self->do_gene($options{do_gene}); |
330
|
|
|
|
|
|
|
} |
331
|
8
|
100
|
|
|
|
17
|
if (exists $options{do_exon}) { |
332
|
2
|
|
|
|
|
7
|
$self->do_exon($options{do_exon}); |
333
|
|
|
|
|
|
|
} |
334
|
8
|
100
|
|
|
|
17
|
if (exists $options{do_cds}) { |
335
|
4
|
|
|
|
|
7
|
$self->do_cds($options{do_cds}); |
336
|
|
|
|
|
|
|
} |
337
|
8
|
50
|
|
|
|
16
|
if (exists $options{do_utr}) { |
338
|
0
|
|
|
|
|
0
|
$self->do_utr($options{do_utr}); |
339
|
|
|
|
|
|
|
} |
340
|
8
|
50
|
|
|
|
14
|
if (exists $options{do_codon}) { |
341
|
0
|
|
|
|
|
0
|
$self->do_codon($options{do_codon}); |
342
|
|
|
|
|
|
|
} |
343
|
8
|
50
|
|
|
|
14
|
if (exists $options{version}) { |
344
|
0
|
|
|
|
|
0
|
$self->version($options{version}); |
345
|
|
|
|
|
|
|
} |
346
|
8
|
50
|
33
|
|
|
17
|
if (exists $options{file} or $options{table}) { |
347
|
8
|
|
33
|
|
|
15
|
$options{file} ||= $options{table}; |
348
|
8
|
50
|
|
|
|
17
|
$self->open_file( $options{file} ) or |
349
|
|
|
|
|
|
|
croak "unable to open file!"; |
350
|
|
|
|
|
|
|
} |
351
|
8
|
50
|
|
|
|
17
|
if (exists $options{class}) { |
352
|
8
|
|
|
|
|
20
|
my $class = $options{class}; |
353
|
8
|
50
|
|
|
|
381
|
if (eval "require $class; 1") { |
354
|
8
|
|
|
|
|
20
|
$SFCLASS = $class; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
else { |
357
|
0
|
|
|
|
|
0
|
croak $@; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
} |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
# done |
364
|
11
|
|
|
|
|
36
|
return $self; |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
sub do_gene { |
368
|
593
|
|
|
593
|
1
|
1252
|
my $self = shift; |
369
|
593
|
100
|
|
|
|
814
|
if (@_) { |
370
|
7
|
|
|
|
|
12
|
$self->{'do_gene'} = shift; |
371
|
|
|
|
|
|
|
} |
372
|
593
|
|
|
|
|
1354
|
return $self->{'do_gene'}; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
sub do_exon { |
376
|
188
|
|
|
188
|
1
|
200
|
my $self = shift; |
377
|
188
|
100
|
|
|
|
305
|
if (@_) { |
378
|
6
|
|
|
|
|
10
|
$self->{'do_exon'} = shift; |
379
|
|
|
|
|
|
|
} |
380
|
188
|
|
|
|
|
297
|
return $self->{'do_exon'}; |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
sub do_cds { |
384
|
272
|
|
|
272
|
1
|
314
|
my $self = shift; |
385
|
272
|
100
|
|
|
|
415
|
if (@_) { |
386
|
5
|
|
|
|
|
9
|
$self->{'do_cds'} = shift; |
387
|
|
|
|
|
|
|
} |
388
|
272
|
|
|
|
|
493
|
return $self->{'do_cds'}; |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
sub do_utr { |
392
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
393
|
0
|
0
|
|
|
|
0
|
if (@_) { |
394
|
0
|
|
|
|
|
0
|
$self->{'do_utr'} = shift; |
395
|
|
|
|
|
|
|
} |
396
|
0
|
|
|
|
|
0
|
return $self->{'do_utr'}; |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
sub do_codon { |
400
|
34
|
|
|
34
|
1
|
47
|
my $self = shift; |
401
|
34
|
50
|
|
|
|
62
|
if (@_) { |
402
|
0
|
|
|
|
|
0
|
$self->{'do_codon'} = shift; |
403
|
|
|
|
|
|
|
} |
404
|
34
|
|
|
|
|
70
|
return $self->{'do_codon'}; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
sub do_name { |
408
|
|
|
|
|
|
|
# this does nothing other than maintain compatibility with ucsc parser |
409
|
0
|
|
|
0
|
0
|
0
|
return 0; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
sub share { |
413
|
|
|
|
|
|
|
# this does nothing other than maintain compatibility with ucsc parser |
414
|
0
|
|
|
0
|
0
|
0
|
return 1; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
|
417
|
|
|
|
|
|
|
sub simplify { |
418
|
85
|
|
|
85
|
1
|
101
|
my $self = shift; |
419
|
85
|
100
|
|
|
|
132
|
if (defined $_[0]) { |
420
|
7
|
|
|
|
|
12
|
$self->{simplify} = shift; |
421
|
|
|
|
|
|
|
} |
422
|
85
|
|
|
|
|
837
|
return $self->{simplify}; |
423
|
|
|
|
|
|
|
} |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
sub version { |
426
|
24
|
|
|
24
|
1
|
1145
|
my $self = shift; |
427
|
24
|
100
|
|
|
|
47
|
if (@_) { |
428
|
11
|
|
|
|
|
22
|
my $v = shift; |
429
|
11
|
100
|
33
|
|
|
31
|
if ($v eq '3') { |
|
|
50
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
430
|
7
|
|
|
|
|
19
|
$self->{version} = $v; |
431
|
7
|
|
|
|
|
12
|
$self->{gff3} = 1; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
elsif ($v eq '2.5' or $v eq '2.2') { |
434
|
4
|
|
|
|
|
9
|
$self->{version} = '2.5'; |
435
|
4
|
|
|
|
|
6
|
$self->{gtf} = 1; |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
elsif ($v eq '2' or $v eq '1') { |
438
|
0
|
|
|
|
|
0
|
$self->{version} = $v; |
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
else { |
441
|
0
|
|
|
|
|
0
|
warn "unrecognized GFF version '$v'!\n"; |
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
} |
444
|
24
|
|
|
|
|
284
|
return $self->{version}; |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
sub open_file { |
448
|
11
|
|
|
11
|
1
|
24
|
my $self = shift; |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
# check file |
451
|
11
|
|
|
|
|
16
|
my $filename = shift; |
452
|
11
|
50
|
|
|
|
20
|
unless ($filename) { |
453
|
0
|
|
|
|
|
0
|
cluck("no file name passed!\n"); |
454
|
0
|
|
|
|
|
0
|
return; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# check type list |
458
|
11
|
|
|
|
|
59
|
my $typelist = Bio::ToolBox::Data::file->sample_gff_type_list($filename); |
459
|
11
|
50
|
|
|
|
47
|
if ($typelist !~ /\w+/) { |
460
|
0
|
|
|
|
|
0
|
print "GFF file has no evident types!? $filename may not be a valid GFF file\n"; |
461
|
0
|
|
|
|
|
0
|
return; |
462
|
|
|
|
|
|
|
} |
463
|
11
|
|
|
|
|
25
|
$self->{typelist} = $typelist; |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
# Open filehandle object |
466
|
11
|
50
|
|
|
|
68
|
my $fh = Bio::ToolBox::Data::file->open_to_read_fh($filename) or |
467
|
|
|
|
|
|
|
croak " cannot open file '$filename'!\n"; |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
# check gff version pragma |
470
|
11
|
|
|
|
|
220
|
my $first = $fh->getline; |
471
|
11
|
50
|
|
|
|
379
|
if ($first =~ /^##gff.version\s+([\d\.]+)\s*$/i) { |
472
|
|
|
|
|
|
|
# override any version that may have been inferred from the extension |
473
|
|
|
|
|
|
|
# based on the assumption that this pragma is correct |
474
|
11
|
|
|
|
|
33
|
$self->version($1); |
475
|
|
|
|
|
|
|
} |
476
|
|
|
|
|
|
|
else { |
477
|
|
|
|
|
|
|
# no pragma, reopen the file |
478
|
0
|
|
|
|
|
0
|
$fh->close; |
479
|
0
|
|
|
|
|
0
|
$fh = Bio::ToolBox::Data::file->open_to_read_fh($filename); |
480
|
|
|
|
|
|
|
# set version based on file type extension???? |
481
|
0
|
0
|
|
|
|
0
|
if ($filename =~ /\.gtf.*$/i) { |
|
|
0
|
|
|
|
|
|
482
|
0
|
|
|
|
|
0
|
$self->version('2.5'); |
483
|
0
|
|
|
|
|
0
|
$self->{gtf} = 1; |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
elsif ($filename =~ /\.gff3.*$/i) { |
486
|
0
|
|
|
|
|
0
|
$self->version('3'); |
487
|
0
|
|
|
|
|
0
|
$self->{gff3} = 1; |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
} |
490
|
11
|
|
|
|
|
15
|
$self->{fh} = $fh; |
491
|
11
|
|
|
|
|
34
|
return 1; |
492
|
|
|
|
|
|
|
} |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
sub fh { |
495
|
668
|
|
|
668
|
1
|
1733
|
return shift->{fh}; |
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
sub typelist { |
499
|
13
|
|
|
13
|
1
|
59
|
return shift->{typelist}; |
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
sub next_feature { |
503
|
587
|
|
|
587
|
1
|
3056
|
my $self = shift; |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# check that we have an open filehandle |
506
|
587
|
50
|
|
|
|
775
|
unless ($self->fh) { |
507
|
0
|
|
|
|
|
0
|
croak("no GFF file loaded to parse!"); |
508
|
|
|
|
|
|
|
} |
509
|
587
|
50
|
|
|
|
872
|
return if $self->{'eof'}; |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# check convertor |
512
|
587
|
100
|
|
|
|
863
|
unless (defined $self->{convertor_sub}) { |
513
|
11
|
|
|
|
|
25
|
$self->_assign_gff_converter; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
# look for the next feature line |
517
|
587
|
|
|
|
|
9479
|
while (my $line = $self->{fh}->getline) { |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
# check first character |
520
|
770
|
|
|
|
|
13731
|
my $firstchar = substr($line, 0, 1); |
521
|
|
|
|
|
|
|
|
522
|
|
|
|
|
|
|
# skip any comment and pragma lines that we might encounter |
523
|
770
|
100
|
|
|
|
1728
|
if ($firstchar eq '#') { |
|
|
50
|
|
|
|
|
|
524
|
25
|
50
|
|
|
|
54
|
if ($line =~ /^###$/) { |
|
|
50
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# a close pragma, all we can do is check for orphans |
526
|
|
|
|
|
|
|
# a properly written shouldn't have any orphans, but just in case |
527
|
0
|
|
|
|
|
0
|
$self->check_orphanage; |
528
|
0
|
|
|
|
|
0
|
next; |
529
|
|
|
|
|
|
|
} |
530
|
|
|
|
|
|
|
elsif ($line =~ /^##sequence.region/i) { |
531
|
|
|
|
|
|
|
# sequence region pragma |
532
|
0
|
|
|
|
|
0
|
my ($pragma, $seq_id, $start, $stop) = split /\s+/, $line; |
533
|
0
|
0
|
0
|
|
|
0
|
if (defined $seq_id and $start =~ /^\d+$/ and $stop =~ /^\d+$/) { |
|
|
|
0
|
|
|
|
|
534
|
|
|
|
|
|
|
# we're actually only concerned with the stop coordinate |
535
|
0
|
|
|
|
|
0
|
$self->{seq_ids}{$seq_id} = $stop; |
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
else { |
538
|
0
|
|
|
|
|
0
|
warn "malformed sequence-region pragma! $line\n"; |
539
|
|
|
|
|
|
|
} |
540
|
0
|
|
|
|
|
0
|
next; |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
else { |
543
|
|
|
|
|
|
|
# must be some sort of pragma or a comment line, may be useful, keep it |
544
|
25
|
|
|
|
|
30
|
push @{$self->{comments}}, $line; |
|
25
|
|
|
|
|
39
|
|
545
|
25
|
|
|
|
|
298
|
next; |
546
|
|
|
|
|
|
|
} |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
elsif ($firstchar eq '>') { |
549
|
|
|
|
|
|
|
# fasta header line |
550
|
|
|
|
|
|
|
# this is almost always at the end of the file, and rarely is sequence put |
551
|
|
|
|
|
|
|
# into GFF files anyway, so let's assume it's the end of the file |
552
|
0
|
|
|
|
|
0
|
$self->{'eof'} = 1; |
553
|
0
|
|
|
|
|
0
|
return; |
554
|
|
|
|
|
|
|
} |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
# line must be a GFF feature |
557
|
745
|
|
|
|
|
844
|
chomp $line; |
558
|
745
|
|
|
|
|
2817
|
my @fields = split('\t', $line); |
559
|
745
|
50
|
|
|
|
1274
|
next unless scalar(@fields) == 9; |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# check the primary_tag and generate the SeqFeature object for known types |
562
|
745
|
|
|
|
|
969
|
my $type = lc $fields[2]; |
563
|
745
|
100
|
|
|
|
1808
|
if ($type eq 'cds') { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
564
|
265
|
100
|
|
|
|
444
|
if ($self->do_cds) { |
565
|
199
|
|
|
|
|
243
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
199
|
|
|
|
|
276
|
|
566
|
|
|
|
|
|
|
} else { |
567
|
66
|
|
|
|
|
887
|
next; |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
} |
570
|
|
|
|
|
|
|
elsif ($type eq 'exon') { |
571
|
178
|
50
|
|
|
|
290
|
if ($self->do_exon) { |
572
|
178
|
|
|
|
|
211
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
178
|
|
|
|
|
252
|
|
573
|
|
|
|
|
|
|
} else { |
574
|
0
|
|
|
|
|
0
|
next; |
575
|
|
|
|
|
|
|
} |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
elsif ($type =~ /utr|untranslated/) { |
578
|
0
|
0
|
|
|
|
0
|
if ($self->do_utr) { |
579
|
0
|
|
|
|
|
0
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
0
|
|
|
|
|
0
|
|
580
|
|
|
|
|
|
|
} else { |
581
|
0
|
|
|
|
|
0
|
next; |
582
|
|
|
|
|
|
|
} |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
elsif ($type =~ /codon/) { |
585
|
32
|
50
|
|
|
|
75
|
if ($self->do_codon) { |
586
|
0
|
|
|
|
|
0
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
0
|
|
|
|
|
0
|
|
587
|
|
|
|
|
|
|
} else { |
588
|
32
|
|
|
|
|
454
|
next; |
589
|
|
|
|
|
|
|
} |
590
|
|
|
|
|
|
|
} |
591
|
|
|
|
|
|
|
elsif ($type =~ /gene$/) { |
592
|
165
|
50
|
|
|
|
311
|
if ($self->do_gene) { |
593
|
165
|
|
|
|
|
213
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
165
|
|
|
|
|
277
|
|
594
|
|
|
|
|
|
|
} else { |
595
|
0
|
|
|
|
|
0
|
next; |
596
|
|
|
|
|
|
|
} |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
elsif ($type =~ /transcript|rna/) { |
599
|
36
|
|
|
|
|
50
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
36
|
|
|
|
|
68
|
|
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
elsif ($type =~ /chromosome|contig|scaffold/) { |
602
|
|
|
|
|
|
|
# gff3 files can record the chromosome as a gff record |
603
|
|
|
|
|
|
|
# process this as a region |
604
|
7
|
|
|
|
|
21
|
$self->{seq_ids}{$fields[0]} = $fields[4]; |
605
|
7
|
|
|
|
|
95
|
next; |
606
|
|
|
|
|
|
|
} |
607
|
|
|
|
|
|
|
else { |
608
|
|
|
|
|
|
|
# everything else must be some non-standard weird element |
609
|
|
|
|
|
|
|
# only process this if the user wants everything |
610
|
62
|
100
|
|
|
|
116
|
return &{$self->{convertor_sub}}($self, \@fields) unless $self->simplify; |
|
2
|
|
|
|
|
7
|
|
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
# presumably reached the end of the file |
615
|
7
|
|
|
|
|
217
|
$self->{'eof'} = 1; |
616
|
7
|
|
|
|
|
20
|
return; |
617
|
|
|
|
|
|
|
} |
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
sub next_top_feature { |
620
|
70
|
|
|
70
|
1
|
83
|
my $self = shift; |
621
|
|
|
|
|
|
|
# check that we have an open filehandle |
622
|
70
|
50
|
|
|
|
76
|
unless ($self->fh) { |
623
|
0
|
|
|
|
|
0
|
croak("no GFF3 file loaded to parse!"); |
624
|
|
|
|
|
|
|
} |
625
|
70
|
50
|
|
|
|
95
|
unless ($self->{'eof'}) { |
626
|
0
|
0
|
|
|
|
0
|
$self->parse_file or croak "unable to parse file!"; |
627
|
|
|
|
|
|
|
} |
628
|
70
|
|
|
|
|
77
|
return shift @{ $self->{top_features} }; |
|
70
|
|
|
|
|
154
|
|
629
|
|
|
|
|
|
|
} |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
sub top_features { |
632
|
4
|
|
|
4
|
1
|
4355
|
my $self = shift; |
633
|
4
|
50
|
|
|
|
42
|
unless ($self->{'eof'}) { |
634
|
0
|
|
|
|
|
0
|
$self->parse_file; |
635
|
|
|
|
|
|
|
} |
636
|
4
|
|
|
|
|
8
|
my @features = @{ $self->{top_features} }; |
|
4
|
|
|
|
|
14
|
|
637
|
4
|
50
|
|
|
|
17
|
return wantarray ? @features : \@features; |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
*parse_table = \&parse_file; |
641
|
|
|
|
|
|
|
|
642
|
|
|
|
|
|
|
sub parse_file { |
643
|
7
|
|
|
7
|
1
|
24
|
my $self = shift; |
644
|
|
|
|
|
|
|
# check that we have an open filehandle |
645
|
7
|
50
|
|
|
|
17
|
unless ($self->fh) { |
646
|
0
|
|
|
|
|
0
|
confess("no file loaded to parse!"); |
647
|
|
|
|
|
|
|
} |
648
|
7
|
50
|
|
|
|
24
|
return 1 if $self->{'eof'}; |
649
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# Each line will be processed into a SeqFeature object, and then checked |
651
|
|
|
|
|
|
|
# for parentage. Child objects with a Parent tag will be appropriately |
652
|
|
|
|
|
|
|
# associated with its parent, or put into an orphanage. Any orphans |
653
|
|
|
|
|
|
|
# left will be checked a final time for parents; if a parent can't be |
654
|
|
|
|
|
|
|
# found, it will be lost. Features without a parent are assumed to be |
655
|
|
|
|
|
|
|
# top-level features. |
656
|
|
|
|
|
|
|
|
657
|
7
|
50
|
|
|
|
18
|
printf " Parsing %s format file....\n", |
|
|
100
|
|
|
|
|
|
658
|
|
|
|
|
|
|
$self->version eq '3' ? 'GFF3' : |
659
|
|
|
|
|
|
|
$self->version =~ /2\../ ? 'GTF' : 'GFF'; |
660
|
|
|
|
|
|
|
|
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
TOP_FEATURE_LOOP: |
663
|
7
|
|
|
|
|
44
|
while (my $feature = $self->next_feature) { |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
### Process the feature |
666
|
|
|
|
|
|
|
# check the ID |
667
|
576
|
|
|
|
|
1079
|
my $id = $feature->primary_id; |
668
|
|
|
|
|
|
|
# if the seqfeature didn't have an ID specified from the file, then |
669
|
|
|
|
|
|
|
# Bio::ToolBox::SeqFeature will autogenerate one, but Bio::SeqFeature::Lite |
670
|
|
|
|
|
|
|
# will not - so we will likely lose that feature |
671
|
576
|
100
|
|
|
|
1673
|
if ($id) { |
672
|
|
|
|
|
|
|
# remember this feature since we have an ID |
673
|
404
|
50
|
|
|
|
699
|
if (exists $self->{loaded}{$id}) { |
674
|
|
|
|
|
|
|
# this ID should be unique in the GFF file |
675
|
|
|
|
|
|
|
# otherwise it might be a shared duplicate or a malformed GFF file |
676
|
|
|
|
|
|
|
# generally only a concern for top level features |
677
|
0
|
0
|
|
|
|
0
|
if ($feature->primary_tag =~ /gene|rna|transcript/) { |
678
|
0
|
|
|
|
|
0
|
$self->{duplicate_ids}{$id} += 1; |
679
|
|
|
|
|
|
|
} |
680
|
|
|
|
|
|
|
|
681
|
|
|
|
|
|
|
# store all of the features as an array |
682
|
0
|
0
|
|
|
|
0
|
if (ref($self->{loaded}{$id}) eq 'ARRAY') { |
683
|
|
|
|
|
|
|
# there's more than two duplicates! pile it on! |
684
|
0
|
|
|
|
|
0
|
push @{ $self->{loaded}{$id} }, $feature; |
|
0
|
|
|
|
|
0
|
|
685
|
|
|
|
|
|
|
} |
686
|
|
|
|
|
|
|
else { |
687
|
0
|
|
|
|
|
0
|
my $existing = $self->{loaded}{$id}; |
688
|
0
|
|
|
|
|
0
|
$self->{loaded}{$id} = [$existing, $feature]; |
689
|
|
|
|
|
|
|
} |
690
|
|
|
|
|
|
|
} |
691
|
|
|
|
|
|
|
else { |
692
|
|
|
|
|
|
|
# unique ID, so remember it |
693
|
404
|
|
|
|
|
804
|
$self->{loaded}{$id} = $feature; |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
} |
696
|
|
|
|
|
|
|
# if the feature didn't have an ID, we'll just assume it is |
697
|
|
|
|
|
|
|
# a child of another feature, otherwise it may get lost |
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
# look for parents and children |
700
|
576
|
100
|
|
|
|
968
|
if ($feature->has_tag('Parent')) { |
701
|
|
|
|
|
|
|
# must be a child |
702
|
|
|
|
|
|
|
# there may be more than one parent, per the GFF3 specification |
703
|
411
|
|
|
|
|
1235
|
foreach my $parent_id ( $feature->get_tag_values('Parent') ) { |
704
|
411
|
50
|
|
|
|
1820
|
if (exists $self->{loaded}{$parent_id}) { |
705
|
|
|
|
|
|
|
# we've seen this id |
706
|
|
|
|
|
|
|
# associate the child with the parent |
707
|
411
|
|
|
|
|
548
|
my $parent = $self->{loaded}{$parent_id}; |
708
|
411
|
|
|
|
|
743
|
$parent->add_SeqFeature($feature); |
709
|
|
|
|
|
|
|
|
710
|
|
|
|
|
|
|
# check boundaries for gtf genes |
711
|
|
|
|
|
|
|
# gtf genes may not be explicitly defined so must correct as necessary |
712
|
|
|
|
|
|
|
# gff3 files shouldn't have this issue |
713
|
411
|
100
|
|
|
|
13191
|
if ($self->{gtf}) { |
714
|
312
|
50
|
|
|
|
474
|
if ($feature->start < $parent->start) { |
715
|
0
|
|
|
|
|
0
|
$parent->start( $feature->start ); |
716
|
|
|
|
|
|
|
} |
717
|
312
|
100
|
|
|
|
1894
|
if ($feature->end > $parent->end) { |
718
|
1
|
|
|
|
|
3
|
$parent->end( $feature->end ); |
719
|
|
|
|
|
|
|
} |
720
|
|
|
|
|
|
|
# check parent's parent too |
721
|
312
|
100
|
|
|
|
1780
|
if ($parent->has_tag('Parent')) { |
722
|
|
|
|
|
|
|
# in all likelihood parent is a transcript and there is a |
723
|
|
|
|
|
|
|
# gene that probably also needs fixin' |
724
|
278
|
|
|
|
|
849
|
my ($grandparent_id) = $parent->get_tag_values('Parent'); |
725
|
278
|
|
|
|
|
1205
|
my $grandparent = $self->{loaded}{$grandparent_id}; |
726
|
278
|
50
|
|
|
|
389
|
if ($feature->start < $grandparent->start) { |
727
|
0
|
|
|
|
|
0
|
$grandparent->start( $feature->start ); |
728
|
|
|
|
|
|
|
} |
729
|
278
|
50
|
|
|
|
1643
|
if ($feature->end > $grandparent->end) { |
730
|
0
|
|
|
|
|
0
|
$grandparent->end( $feature->end ); |
731
|
|
|
|
|
|
|
} |
732
|
|
|
|
|
|
|
} |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
} |
735
|
|
|
|
|
|
|
else { |
736
|
|
|
|
|
|
|
# can't find the parent, maybe not loaded yet? |
737
|
|
|
|
|
|
|
# put 'em in the orphanage |
738
|
0
|
|
|
|
|
0
|
$self->_add_orphan($feature); |
739
|
|
|
|
|
|
|
} |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
} |
742
|
|
|
|
|
|
|
else { |
743
|
|
|
|
|
|
|
# must be a parent |
744
|
165
|
|
|
|
|
269
|
push @{ $self->{top_features} }, $feature; |
|
165
|
|
|
|
|
401
|
|
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
} |
747
|
|
|
|
|
|
|
# Finished loading the GFF lines |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# check for orphans |
750
|
7
|
50
|
|
|
|
12
|
if (scalar @{ $self->{orphans} }) { |
|
7
|
|
|
|
|
20
|
|
751
|
0
|
|
|
|
|
0
|
$self->check_orphanage; |
752
|
|
|
|
|
|
|
# report |
753
|
0
|
0
|
|
|
|
0
|
if (scalar @{ $self->{orphans} }) { |
|
0
|
|
|
|
|
0
|
|
754
|
|
|
|
|
|
|
printf " GFF errors: %d features could not be associated with reported parents!\n", |
755
|
0
|
|
|
|
|
0
|
scalar @{ $self->{orphans} }; |
|
0
|
|
|
|
|
0
|
|
756
|
|
|
|
|
|
|
} |
757
|
|
|
|
|
|
|
} |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# report on duplicate IDs |
760
|
7
|
50
|
|
|
|
11
|
if (keys %{ $self->{duplicate_ids} }) { |
|
7
|
|
|
|
|
788
|
|
761
|
|
|
|
|
|
|
printf " GFF errors: %d IDs were duplicated\n", |
762
|
0
|
|
|
|
|
0
|
scalar(keys %{ $self->{duplicate_ids} }); |
|
0
|
|
|
|
|
0
|
|
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
|
765
|
7
|
|
|
|
|
40
|
return 1; |
766
|
|
|
|
|
|
|
} |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
sub _make_gene_parent { |
770
|
|
|
|
|
|
|
# for generating GTF gene parent features |
771
|
12
|
|
|
12
|
|
22
|
my ($self, $fields, $gene_id) = @_; |
772
|
12
|
|
|
|
|
43
|
my $gene = $SFCLASS->new( |
773
|
|
|
|
|
|
|
-seq_id => $fields->[0], |
774
|
|
|
|
|
|
|
-primary_tag => 'gene', |
775
|
|
|
|
|
|
|
-start => $fields->[3], |
776
|
|
|
|
|
|
|
-end => $fields->[4], |
777
|
|
|
|
|
|
|
-strand => $fields->[6], |
778
|
|
|
|
|
|
|
-primary_id => $gene_id, |
779
|
|
|
|
|
|
|
); |
780
|
|
|
|
|
|
|
|
781
|
12
|
50
|
|
|
|
270
|
if ($fields->[8] =~ /gene_name "([^"]+)";?/) { |
782
|
12
|
|
|
|
|
25
|
$gene->display_name($1); |
783
|
|
|
|
|
|
|
} |
784
|
|
|
|
|
|
|
else { |
785
|
0
|
|
|
|
|
0
|
$gene->display_name($gene_id); |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
|
788
|
|
|
|
|
|
|
# add extra information if possible |
789
|
12
|
100
|
|
|
|
53
|
unless ($self->simplify) { |
790
|
2
|
50
|
|
|
|
11
|
if ($fields->[8] =~ /gene_biotype "([^"]+)";?/) { |
|
|
50
|
|
|
|
|
|
791
|
0
|
|
|
|
|
0
|
$gene->add_tag_value('gene_biotype', $1); |
792
|
|
|
|
|
|
|
} |
793
|
|
|
|
|
|
|
elsif ($fields->[8] =~ /gene_type "([^"]+)";?/) { |
794
|
0
|
|
|
|
|
0
|
$gene->add_tag_value('gene_type', $1); |
795
|
|
|
|
|
|
|
} |
796
|
2
|
50
|
|
|
|
6
|
if ($fields->[8] =~ /gene_source "([^"]+)";?/) { |
797
|
0
|
|
|
|
|
0
|
$gene->source($1); |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
} |
800
|
12
|
|
|
|
|
16
|
return $gene; |
801
|
|
|
|
|
|
|
} |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
|
804
|
|
|
|
|
|
|
sub _make_rna_parent { |
805
|
|
|
|
|
|
|
# for generating GTF gene parent features |
806
|
0
|
|
|
0
|
|
0
|
my ($self, $fields, $transcript_id) = @_; |
807
|
0
|
|
|
|
|
0
|
my $rna = $SFCLASS->new( |
808
|
|
|
|
|
|
|
-seq_id => $fields->[0], |
809
|
|
|
|
|
|
|
-primary_tag => 'transcript', |
810
|
|
|
|
|
|
|
-start => $fields->[3], |
811
|
|
|
|
|
|
|
-end => $fields->[4], |
812
|
|
|
|
|
|
|
-strand => $fields->[6], |
813
|
|
|
|
|
|
|
-primary_id => $transcript_id, |
814
|
|
|
|
|
|
|
); |
815
|
|
|
|
|
|
|
|
816
|
0
|
0
|
|
|
|
0
|
if ($fields->[8] =~ /transcript_name "([^"]+)";?/) { |
817
|
0
|
|
|
|
|
0
|
$rna->display_name($1); |
818
|
|
|
|
|
|
|
} |
819
|
|
|
|
|
|
|
else { |
820
|
0
|
|
|
|
|
0
|
$rna->display_name($transcript_id); |
821
|
|
|
|
|
|
|
} |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
# add extra information if possible |
824
|
0
|
0
|
|
|
|
0
|
unless ($self->simplify) { |
825
|
0
|
0
|
|
|
|
0
|
if ($fields->[8] =~ /transcript_biotype "([^"]+)";?/) { |
|
|
0
|
|
|
|
|
|
826
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('transcript_biotype', $1); |
827
|
|
|
|
|
|
|
} |
828
|
|
|
|
|
|
|
elsif ($fields->[8] =~ /transcript_type "([^"]+)";?/) { |
829
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('transcript_type', $1); |
830
|
|
|
|
|
|
|
} |
831
|
0
|
0
|
|
|
|
0
|
if ($fields->[8] =~ /transcript_source "([^"]+)";?/) { |
832
|
0
|
|
|
|
|
0
|
$rna->source($1); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
} |
835
|
0
|
|
|
|
|
0
|
return $rna; |
836
|
|
|
|
|
|
|
} |
837
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
|
839
|
|
|
|
|
|
|
sub find_gene { |
840
|
37
|
|
|
37
|
1
|
10880
|
my $self = shift; |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# check that we have gene2seqf table |
843
|
37
|
100
|
|
|
|
61
|
unless (exists $self->{gene2seqf}) { |
844
|
5
|
50
|
|
|
|
14
|
croak "must parse file first!" unless $self->{'eof'}; |
845
|
5
|
|
|
|
|
10
|
$self->{gene2seqf} = {}; |
846
|
5
|
|
|
|
|
8
|
foreach (@{ $self->{top_features} }) { |
|
5
|
|
|
|
|
13
|
|
847
|
107
|
|
|
|
|
147
|
my $name = lc $_->display_name; |
848
|
107
|
50
|
|
|
|
308
|
if (exists $self->{gene2seqf}->{$name}) { |
849
|
0
|
|
|
|
|
0
|
push @{ $self->{gene2seqf}->{$name} }, $_; |
|
0
|
|
|
|
|
0
|
|
850
|
|
|
|
|
|
|
} |
851
|
|
|
|
|
|
|
else { |
852
|
107
|
|
|
|
|
199
|
$self->{gene2seqf}->{$name} = [$_]; |
853
|
|
|
|
|
|
|
} |
854
|
|
|
|
|
|
|
} |
855
|
|
|
|
|
|
|
} |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
# get the name and coordinates from arguments |
858
|
37
|
|
|
|
|
48
|
my ($name, $id, $chrom, $start, $end, $strand); |
859
|
37
|
50
|
|
|
|
67
|
if (scalar @_ == 0) { |
|
|
100
|
|
|
|
|
|
860
|
0
|
|
|
|
|
0
|
carp "must provide information to find_gene method!"; |
861
|
0
|
|
|
|
|
0
|
return; |
862
|
|
|
|
|
|
|
} |
863
|
|
|
|
|
|
|
elsif (scalar @_ == 1) { |
864
|
4
|
|
|
|
|
6
|
$name = $_[0]; |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
else { |
867
|
33
|
|
|
|
|
60
|
my %opt = @_; |
868
|
33
|
|
0
|
|
|
49
|
$name = $opt{name} || $opt{display_name} || undef; |
869
|
33
|
|
0
|
|
|
45
|
$id = $opt{id} || $opt{primary_id} || undef; |
870
|
33
|
|
50
|
|
|
105
|
$chrom = $opt{chrom} || $opt{seq_id} || undef; |
871
|
33
|
|
50
|
|
|
72
|
$start = $opt{start} || undef; |
872
|
33
|
|
50
|
|
|
86
|
$end = $opt{stop} || $opt{end} || undef; |
873
|
33
|
|
50
|
|
|
68
|
$strand = $opt{strand} || 0; |
874
|
|
|
|
|
|
|
} |
875
|
37
|
50
|
|
|
|
67
|
unless ($name) { |
876
|
0
|
|
|
|
|
0
|
carp "name is required for find_gene!"; |
877
|
0
|
|
|
|
|
0
|
return; |
878
|
|
|
|
|
|
|
} |
879
|
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
# check if a gene with this name exists |
881
|
37
|
50
|
|
|
|
77
|
if (exists $self->{gene2seqf}->{lc $name} ) { |
882
|
|
|
|
|
|
|
# we found a matching gene |
883
|
|
|
|
|
|
|
|
884
|
|
|
|
|
|
|
# pull out the gene seqfeature(s) array reference |
885
|
|
|
|
|
|
|
# there may be more than one gene |
886
|
37
|
|
|
|
|
50
|
my $genes = $self->{gene2seqf}->{ lc $name }; |
887
|
|
|
|
|
|
|
|
888
|
|
|
|
|
|
|
# go through a series of checks to find the appropriate |
889
|
37
|
100
|
|
|
|
46
|
if ($id) { |
890
|
33
|
|
|
|
|
42
|
foreach my $g (@$genes) { |
891
|
33
|
50
|
|
|
|
60
|
if ($g->primary_id eq $id) { |
892
|
33
|
|
|
|
|
60
|
return $g; |
893
|
|
|
|
|
|
|
} |
894
|
|
|
|
|
|
|
} |
895
|
0
|
|
|
|
|
0
|
return; # none of these matched despite having an ID |
896
|
|
|
|
|
|
|
} |
897
|
4
|
0
|
33
|
|
|
9
|
if ($chrom and $start and $end) { |
|
|
|
33
|
|
|
|
|
898
|
0
|
|
|
|
|
0
|
foreach my $g (@$genes) { |
899
|
0
|
0
|
0
|
|
|
0
|
if ( |
|
|
|
0
|
|
|
|
|
900
|
|
|
|
|
|
|
# overlap method borrowed from Bio::RangeI |
901
|
|
|
|
|
|
|
($g->strand == $strand) and not ( |
902
|
|
|
|
|
|
|
$g->start > $end or |
903
|
|
|
|
|
|
|
$g->end < $start |
904
|
|
|
|
|
|
|
) |
905
|
|
|
|
|
|
|
) { |
906
|
|
|
|
|
|
|
# gene and transcript overlap on the same strand |
907
|
|
|
|
|
|
|
# we found the intersecting gene |
908
|
0
|
|
|
|
|
0
|
return $g; |
909
|
|
|
|
|
|
|
} |
910
|
|
|
|
|
|
|
} |
911
|
0
|
|
|
|
|
0
|
return; # none of these matched despite having coordinate info |
912
|
|
|
|
|
|
|
} |
913
|
4
|
50
|
|
|
|
9
|
if (scalar @$genes == 1) { |
|
|
0
|
|
|
|
|
|
914
|
|
|
|
|
|
|
# going on trust here that this is the one |
915
|
4
|
|
|
|
|
12
|
return $genes->[0]; |
916
|
|
|
|
|
|
|
} |
917
|
|
|
|
|
|
|
elsif (scalar @$genes > 1) { |
918
|
0
|
|
|
|
|
0
|
carp "more than one gene named $name found!"; |
919
|
0
|
|
|
|
|
0
|
return $genes->[0]; |
920
|
|
|
|
|
|
|
} |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
# nothing suitable found |
923
|
0
|
|
|
|
|
0
|
return; |
924
|
|
|
|
|
|
|
} |
925
|
|
|
|
|
|
|
} |
926
|
|
|
|
|
|
|
|
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
sub from_gff_string { |
929
|
0
|
|
|
0
|
1
|
0
|
my ($self, $string) = @_; |
930
|
|
|
|
|
|
|
|
931
|
|
|
|
|
|
|
# check string |
932
|
0
|
|
|
|
|
0
|
chomp $string; |
933
|
0
|
0
|
|
|
|
0
|
unless ($string) { |
934
|
0
|
|
|
|
|
0
|
cluck("must pass a string!\n"); |
935
|
0
|
|
|
|
|
0
|
return; |
936
|
|
|
|
|
|
|
} |
937
|
0
|
|
|
|
|
0
|
my @fields = split('\t', $string); |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
# check convertor |
940
|
0
|
0
|
|
|
|
0
|
unless (defined $self->{convertor_sub}) { |
941
|
0
|
|
|
|
|
0
|
$self->_assign_gff_converter; |
942
|
|
|
|
|
|
|
} |
943
|
|
|
|
|
|
|
|
944
|
|
|
|
|
|
|
# parse appropriately |
945
|
0
|
|
|
|
|
0
|
return &{$self->{convertor_sub}}($self, \@fields); |
|
0
|
|
|
|
|
0
|
|
946
|
|
|
|
|
|
|
} |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
|
949
|
|
|
|
|
|
|
sub _assign_gff_converter { |
950
|
11
|
|
|
11
|
|
15
|
my $self = shift; |
951
|
11
|
100
|
|
|
|
28
|
if ($self->{gff3}) { |
|
|
50
|
|
|
|
|
|
952
|
7
|
|
|
|
|
19
|
$self->{convertor_sub} = \&_gff3_to_seqf; |
953
|
|
|
|
|
|
|
} |
954
|
|
|
|
|
|
|
elsif ($self->{gtf}) { |
955
|
4
|
100
|
|
|
|
7
|
if ($self->simplify) { |
956
|
2
|
|
|
|
|
5
|
$self->{convertor_sub} = \&_gtf_to_seqf_simple; |
957
|
|
|
|
|
|
|
} |
958
|
|
|
|
|
|
|
else { |
959
|
2
|
|
|
|
|
5
|
$self->{convertor_sub} = \&_gtf_to_seqf_full; |
960
|
|
|
|
|
|
|
} |
961
|
|
|
|
|
|
|
# double check we have transcript information |
962
|
4
|
50
|
|
|
|
9
|
if ($self->typelist !~ /transcript|rna/i) { |
963
|
|
|
|
|
|
|
# we will have to rely on exon and/or cds information to get transcript |
964
|
0
|
0
|
0
|
|
|
0
|
unless ($self->do_exon or $self->do_cds) { |
965
|
0
|
|
|
|
|
0
|
$self->do_exon(1); |
966
|
|
|
|
|
|
|
} |
967
|
|
|
|
|
|
|
} |
968
|
4
|
50
|
33
|
|
|
9
|
if ($self->do_gene and $self->typelist !~ /gene/i) { |
969
|
4
|
|
|
|
|
7
|
$self->do_exon(1); |
970
|
|
|
|
|
|
|
} |
971
|
|
|
|
|
|
|
} |
972
|
|
|
|
|
|
|
else { |
973
|
0
|
|
|
|
|
0
|
$self->{convertor_sub} = \&_gff2_to_seqf; |
974
|
|
|
|
|
|
|
} |
975
|
|
|
|
|
|
|
} |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
|
978
|
|
|
|
|
|
|
sub _gff3_to_seqf { |
979
|
266
|
|
|
266
|
|
327
|
my ($self, $fields) = @_; |
980
|
266
|
|
|
|
|
322
|
my $group = $fields->[8]; |
981
|
266
|
|
|
|
|
418
|
my $feature = $self->_gff_to_seqf($fields); |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
# process groups |
984
|
266
|
|
|
|
|
1563
|
foreach my $g (split(/\s*;\s*/, $group)) { |
985
|
1382
|
|
|
|
|
3256
|
my ($tag, $value) = split /=/, $g; |
986
|
1382
|
|
|
|
|
1905
|
$tag = $self->unescape($tag); |
987
|
1382
|
|
|
|
|
2019
|
my @values = map { $self->unescape($_) } split(/,/, $value); |
|
1470
|
|
|
|
|
1689
|
|
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
# determine the appropriate attribute based on tag name |
990
|
1382
|
100
|
|
|
|
3531
|
if ($tag eq 'Name') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
991
|
266
|
|
|
|
|
460
|
$feature->display_name($values[0]); |
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
|
|
|
elsif ($tag eq 'ID') { |
994
|
167
|
|
|
|
|
334
|
$feature->primary_id($values[0]); |
995
|
|
|
|
|
|
|
} |
996
|
|
|
|
|
|
|
elsif ($tag eq 'Parent') { |
997
|
|
|
|
|
|
|
# always record Parent except for transcripts when genes are not wanted |
998
|
99
|
50
|
33
|
|
|
158
|
unless (not $self->do_gene and $feature->primary_tag =~ /rna|transcript/i) { |
999
|
99
|
|
|
|
|
135
|
foreach (@values) { |
1000
|
99
|
|
|
|
|
190
|
$feature->add_tag_value($tag, $_); |
1001
|
|
|
|
|
|
|
} |
1002
|
|
|
|
|
|
|
} |
1003
|
|
|
|
|
|
|
} |
1004
|
|
|
|
|
|
|
elsif (lc $tag eq 'exon_id') { |
1005
|
|
|
|
|
|
|
# ensembl GFF3 store the exon id but doesn't record it as the ID, why? |
1006
|
0
|
|
|
|
|
0
|
$feature->primary_id($values[0]); |
1007
|
|
|
|
|
|
|
} |
1008
|
|
|
|
|
|
|
elsif (not $self->{simplify}) { |
1009
|
2
|
|
|
|
|
5
|
foreach (@values) { |
1010
|
2
|
|
|
|
|
7
|
$feature->add_tag_value($tag, $_); |
1011
|
|
|
|
|
|
|
} |
1012
|
|
|
|
|
|
|
} |
1013
|
|
|
|
|
|
|
} |
1014
|
|
|
|
|
|
|
|
1015
|
266
|
|
|
|
|
989
|
return $feature; |
1016
|
|
|
|
|
|
|
} |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
|
1019
|
|
|
|
|
|
|
sub _gtf_to_seqf_simple { |
1020
|
|
|
|
|
|
|
|
1021
|
312
|
|
|
312
|
|
425
|
my ($self, $fields) = @_; |
1022
|
312
|
|
|
|
|
400
|
my $group = $fields->[8]; |
1023
|
312
|
|
|
|
|
503
|
my $feature = $self->_gff_to_seqf($fields); |
1024
|
|
|
|
|
|
|
|
1025
|
|
|
|
|
|
|
# extract essential tags |
1026
|
312
|
|
|
|
|
371
|
my ($gene_id, $transcript_id); |
1027
|
312
|
50
|
|
|
|
1074
|
if ($group =~ /gene_id "([^"]+)";?/i) { |
1028
|
312
|
|
|
|
|
574
|
$gene_id = $1; |
1029
|
|
|
|
|
|
|
} |
1030
|
312
|
50
|
|
|
|
787
|
if ($group =~ /transcript_id "([^"]+)";?/i) { |
1031
|
312
|
|
|
|
|
454
|
$transcript_id = $1; |
1032
|
|
|
|
|
|
|
} |
1033
|
312
|
50
|
33
|
|
|
492
|
unless ($gene_id or $transcript_id) { |
1034
|
|
|
|
|
|
|
# improperly formatted GTF file without one of these two items, nothing more to do |
1035
|
0
|
|
|
|
|
0
|
return $feature; |
1036
|
|
|
|
|
|
|
} |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
# common subfeatures including exon, CDS, UTR, and codons |
1039
|
312
|
100
|
|
|
|
1028
|
if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1040
|
278
|
|
|
|
|
659
|
$feature->add_tag_value('Parent', $transcript_id); |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
# exon id if present |
1043
|
278
|
50
|
66
|
|
|
1526
|
if ($fields->[2] eq 'exon' and $group =~ /exon_id "([^"]+)";?/i) { |
1044
|
0
|
|
|
|
|
0
|
$feature->primary_id($1); |
1045
|
|
|
|
|
|
|
} |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
# check gene parent |
1048
|
278
|
50
|
33
|
|
|
484
|
if ($self->do_gene and not exists $self->{loaded}{$gene_id}) { |
1049
|
0
|
|
|
|
|
0
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
1050
|
0
|
|
|
|
|
0
|
$self->{loaded}{$gene_id} = $gene; |
1051
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $gene; |
|
0
|
|
|
|
|
0
|
|
1052
|
|
|
|
|
|
|
} |
1053
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# check transcript parent |
1055
|
278
|
50
|
|
|
|
492
|
unless (exists $self->{loaded}{$transcript_id}) { |
1056
|
0
|
|
|
|
|
0
|
my $rna = $self->_make_rna_parent($fields, $transcript_id); |
1057
|
0
|
|
|
|
|
0
|
$self->{loaded}{$transcript_id} = $rna; |
1058
|
0
|
0
|
|
|
|
0
|
if ($self->do_gene) { |
1059
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('Parent', $gene_id); |
1060
|
0
|
|
|
|
|
0
|
$self->{loaded}{$gene_id}->add_SeqFeature($rna); |
1061
|
|
|
|
|
|
|
} |
1062
|
|
|
|
|
|
|
else { |
1063
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $rna; |
|
0
|
|
|
|
|
0
|
|
1064
|
|
|
|
|
|
|
} |
1065
|
|
|
|
|
|
|
} |
1066
|
|
|
|
|
|
|
} |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
# a transcript feature |
1069
|
|
|
|
|
|
|
elsif ($fields->[2] =~ /rna|transcript/i) { |
1070
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
1071
|
|
|
|
|
|
|
# but are not required and often absent |
1072
|
34
|
|
|
|
|
97
|
$feature->primary_id($transcript_id); |
1073
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
# transcript information |
1075
|
34
|
50
|
|
|
|
214
|
if ($group =~ /transcript_name "([^"]+)";?/i) { |
1076
|
34
|
|
|
|
|
73
|
$feature->display_name($1); |
1077
|
|
|
|
|
|
|
} |
1078
|
|
|
|
|
|
|
|
1079
|
|
|
|
|
|
|
# check whether parent was loaded and add gene information if not |
1080
|
34
|
50
|
|
|
|
156
|
if ($self->do_gene) { |
1081
|
34
|
|
|
|
|
88
|
$feature->add_tag_value('Parent', $gene_id); |
1082
|
34
|
100
|
|
|
|
172
|
if (not exists $self->{loaded}{$gene_id}) { |
1083
|
10
|
|
|
|
|
18
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
1084
|
10
|
|
|
|
|
30
|
$self->{loaded}{$gene_id} = $gene; |
1085
|
10
|
|
|
|
|
13
|
push @{ $self->{top_features} }, $gene; |
|
10
|
|
|
|
|
17
|
|
1086
|
|
|
|
|
|
|
} |
1087
|
|
|
|
|
|
|
} |
1088
|
|
|
|
|
|
|
} |
1089
|
|
|
|
|
|
|
|
1090
|
|
|
|
|
|
|
# a gene feature |
1091
|
|
|
|
|
|
|
elsif ($fields->[2] eq 'gene') { |
1092
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
1093
|
|
|
|
|
|
|
# but are not required and often absent |
1094
|
0
|
|
|
|
|
0
|
$feature->primary_id($gene_id); |
1095
|
0
|
0
|
|
|
|
0
|
if ($group =~ /gene_name "([^"]+)";?/i) { |
1096
|
0
|
|
|
|
|
0
|
$feature->display_name($1); |
1097
|
|
|
|
|
|
|
} |
1098
|
|
|
|
|
|
|
} |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
# anything else, like CNS (conserved noncoding sequence) doesn't get any |
1101
|
|
|
|
|
|
|
# further special attributes, as far as I can tell |
1102
|
312
|
|
|
|
|
1031
|
return $feature; |
1103
|
|
|
|
|
|
|
} |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
sub _gtf_to_seqf_full { |
1107
|
|
|
|
|
|
|
|
1108
|
2
|
|
|
2
|
|
4
|
my ($self, $fields) = @_; |
1109
|
2
|
|
|
|
|
4
|
my $group = $fields->[8]; |
1110
|
2
|
|
|
|
|
5
|
my $feature = $self->_gff_to_seqf($fields); |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
# process the group tags |
1113
|
2
|
|
|
|
|
2
|
my %attributes; |
1114
|
2
|
|
|
|
|
9
|
foreach my $g (split('; ', $group)) { # supposed to be "; " as delimiter |
1115
|
10
|
|
|
|
|
19
|
my ($tag, $value, @bits) = split ' ', $g; |
1116
|
10
|
|
|
|
|
27
|
$value =~ s/[";]//g; # remove the flanking double quotes, assume no internal quotes |
1117
|
10
|
|
|
|
|
18
|
$attributes{$tag} = $value; |
1118
|
|
|
|
|
|
|
} |
1119
|
|
|
|
|
|
|
|
1120
|
|
|
|
|
|
|
# assign special tags based on the feature type |
1121
|
2
|
50
|
|
|
|
15
|
if ($fields->[2] =~ /cds|exon|utr|codon|untranslated/i) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1122
|
0
|
|
|
|
|
0
|
$feature->add_tag_value('Parent', $attributes{'transcript_id'}); |
1123
|
|
|
|
|
|
|
|
1124
|
|
|
|
|
|
|
# exon id if present |
1125
|
0
|
0
|
0
|
|
|
0
|
if ($fields->[2] eq 'exon' and exists $attributes{'exon_id'}) { |
1126
|
0
|
|
|
|
|
0
|
$feature->primary_id($attributes{'exon_id'}); |
1127
|
|
|
|
|
|
|
} |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
# check gene parent |
1130
|
0
|
0
|
|
|
|
0
|
if ($self->do_gene) { |
1131
|
0
|
|
0
|
|
|
0
|
my $gene_id = $attributes{gene_id} || undef; |
1132
|
0
|
0
|
0
|
|
|
0
|
if ($gene_id and not exists $self->{loaded}{$gene_id}) { |
1133
|
0
|
|
|
|
|
0
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
1134
|
0
|
|
|
|
|
0
|
$self->{loaded}{$gene_id} = $gene; |
1135
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $gene; |
|
0
|
|
|
|
|
0
|
|
1136
|
|
|
|
|
|
|
} |
1137
|
|
|
|
|
|
|
} |
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
# check transcript parent |
1140
|
0
|
|
0
|
|
|
0
|
my $transcript_id = $attributes{transcript_id} || undef; |
1141
|
0
|
0
|
0
|
|
|
0
|
if ($transcript_id and not exists $self->{loaded}{$transcript_id}) { |
1142
|
0
|
|
|
|
|
0
|
my $rna = $self->_make_rna_parent($fields, $transcript_id); |
1143
|
0
|
|
|
|
|
0
|
$self->{loaded}{$transcript_id} = $rna; |
1144
|
0
|
0
|
|
|
|
0
|
if ($self->do_gene) { |
1145
|
0
|
|
|
|
|
0
|
$rna->add_tag_value('Parent', $attributes{gene_id}); |
1146
|
0
|
|
|
|
|
0
|
$self->{loaded}{ $attributes{gene_id} }->add_SeqFeature($rna); |
1147
|
|
|
|
|
|
|
} |
1148
|
|
|
|
|
|
|
else { |
1149
|
0
|
|
|
|
|
0
|
push @{ $self->{top_features} }, $rna; |
|
0
|
|
|
|
|
0
|
|
1150
|
|
|
|
|
|
|
} |
1151
|
|
|
|
|
|
|
} |
1152
|
|
|
|
|
|
|
} |
1153
|
|
|
|
|
|
|
|
1154
|
|
|
|
|
|
|
# transcript |
1155
|
|
|
|
|
|
|
elsif ($fields->[2] =~ /transcript|rna/i) { |
1156
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
1157
|
|
|
|
|
|
|
|
1158
|
|
|
|
|
|
|
# transcript information |
1159
|
2
|
|
|
|
|
7
|
$feature->primary_id($attributes{transcript_id}); |
1160
|
2
|
|
|
|
|
8
|
delete $attributes{transcript_id}; |
1161
|
2
|
50
|
|
|
|
5
|
if (exists $attributes{transcript_name}) { |
1162
|
2
|
|
|
|
|
5
|
$feature->display_name($attributes{transcript_name}); |
1163
|
2
|
|
|
|
|
8
|
delete $attributes{transcript_name}; |
1164
|
|
|
|
|
|
|
} |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
# check gene parent |
1167
|
2
|
50
|
|
|
|
5
|
if ($self->do_gene) { |
1168
|
2
|
|
50
|
|
|
6
|
my $gene_id = $attributes{gene_id} || undef; |
1169
|
2
|
50
|
33
|
|
|
8
|
if ($gene_id and not exists $self->{loaded}{$gene_id}) { |
1170
|
2
|
|
|
|
|
6
|
my $gene = $self->_make_gene_parent($fields, $gene_id); |
1171
|
2
|
|
|
|
|
4
|
$self->{loaded}{$gene_id} = $gene; |
1172
|
2
|
|
|
|
|
3
|
push @{ $self->{top_features} }, $gene; |
|
2
|
|
|
|
|
4
|
|
1173
|
|
|
|
|
|
|
} |
1174
|
2
|
|
|
|
|
5
|
$feature->add_tag_value('Parent', $gene_id); |
1175
|
|
|
|
|
|
|
} |
1176
|
|
|
|
|
|
|
} |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
# gene |
1179
|
|
|
|
|
|
|
elsif ($fields->[2] eq 'gene') { |
1180
|
|
|
|
|
|
|
# these are sometimes present in GTF files, such as from Ensembl |
1181
|
|
|
|
|
|
|
# but are not required and often absent |
1182
|
0
|
|
|
|
|
0
|
$feature->primary_id($attributes{gene_id}); |
1183
|
0
|
|
|
|
|
0
|
delete $attributes{gene_id}; |
1184
|
0
|
0
|
|
|
|
0
|
if (exists $attributes{gene_name}) { |
1185
|
0
|
|
|
|
|
0
|
$feature->display_name($attributes{gene_name}); |
1186
|
0
|
|
|
|
|
0
|
delete $attributes{gene_name}; |
1187
|
|
|
|
|
|
|
} |
1188
|
|
|
|
|
|
|
} |
1189
|
|
|
|
|
|
|
|
1190
|
|
|
|
|
|
|
# store remaining attributes, if any |
1191
|
2
|
|
|
|
|
12
|
foreach my $key (keys %attributes) { |
1192
|
6
|
|
|
|
|
19
|
$feature->add_tag_value($key, $attributes{$key}); |
1193
|
|
|
|
|
|
|
} |
1194
|
2
|
|
|
|
|
13
|
return $feature; |
1195
|
|
|
|
|
|
|
} |
1196
|
|
|
|
|
|
|
|
1197
|
|
|
|
|
|
|
|
1198
|
|
|
|
|
|
|
sub _gff2_to_seqf { |
1199
|
|
|
|
|
|
|
# generic gff1 or gff2 format or poorly defined gff3 or gtf file |
1200
|
|
|
|
|
|
|
# hope for the best! |
1201
|
0
|
|
|
0
|
|
0
|
my ($self, $fields) = @_; |
1202
|
0
|
|
|
|
|
0
|
my $feature = $self->_gff_to_seqf($fields); |
1203
|
|
|
|
|
|
|
|
1204
|
|
|
|
|
|
|
# process groups |
1205
|
|
|
|
|
|
|
# we have no uniform method of combining features, so we'll leave the tags |
1206
|
|
|
|
|
|
|
# as is and hope for the best |
1207
|
0
|
|
|
|
|
0
|
foreach my $g (split(/\s*;\s*/, $fields->[8])) { |
1208
|
0
|
|
|
|
|
0
|
my ($tag, $value) = split /\s+/, $g; |
1209
|
0
|
0
|
0
|
|
|
0
|
next unless ($tag and $value); |
1210
|
0
|
|
|
|
|
0
|
$feature->add_tag_value($tag, $value); |
1211
|
|
|
|
|
|
|
} |
1212
|
0
|
|
|
|
|
0
|
return $feature; |
1213
|
|
|
|
|
|
|
} |
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
sub _gff_to_seqf { |
1217
|
580
|
|
|
580
|
|
710
|
my ($self, $fields) = @_; |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
# generate the basic SeqFeature |
1220
|
580
|
|
|
|
|
2035
|
my $feature = $SFCLASS->new( |
1221
|
|
|
|
|
|
|
-seq_id => $fields->[0], |
1222
|
|
|
|
|
|
|
-source => $fields->[1], |
1223
|
|
|
|
|
|
|
-primary_tag => $fields->[2], |
1224
|
|
|
|
|
|
|
-start => $fields->[3], |
1225
|
|
|
|
|
|
|
-end => $fields->[4], |
1226
|
|
|
|
|
|
|
-strand => $fields->[6], |
1227
|
|
|
|
|
|
|
); |
1228
|
|
|
|
|
|
|
|
1229
|
|
|
|
|
|
|
# add more attributes if they're not null |
1230
|
580
|
50
|
|
|
|
9900
|
if ($fields->[5] ne '.') { |
1231
|
0
|
|
|
|
|
0
|
$feature->score($fields->[5]); |
1232
|
|
|
|
|
|
|
} |
1233
|
580
|
100
|
|
|
|
812
|
if ($fields->[7] ne '.') { |
1234
|
199
|
|
|
|
|
377
|
$feature->phase($fields->[7]); |
1235
|
|
|
|
|
|
|
} |
1236
|
|
|
|
|
|
|
|
1237
|
|
|
|
|
|
|
# finished |
1238
|
580
|
|
|
|
|
1026
|
return $feature; |
1239
|
|
|
|
|
|
|
} |
1240
|
|
|
|
|
|
|
|
1241
|
|
|
|
|
|
|
|
1242
|
|
|
|
|
|
|
sub unescape { |
1243
|
|
|
|
|
|
|
# Borrowed unashamedly from bioperl Bio::Tools::GFF |
1244
|
|
|
|
|
|
|
# which in turn was borrowed from Bio::DB::GFF |
1245
|
2852
|
|
|
2852
|
1
|
2670
|
my $self = shift; |
1246
|
2852
|
|
|
|
|
2730
|
my $v = shift; |
1247
|
2852
|
|
|
|
|
3102
|
$v =~ tr/+/ /; |
1248
|
2852
|
|
|
|
|
3703
|
$v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; |
|
5574
|
|
|
|
|
10926
|
|
1249
|
2852
|
|
|
|
|
4213
|
return $v; |
1250
|
|
|
|
|
|
|
} |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
|
1253
|
|
|
|
|
|
|
sub _add_orphan { |
1254
|
0
|
|
|
0
|
|
|
my ($self, $feature) = @_; |
1255
|
0
|
|
|
|
|
|
push @{ $self->{orphans} }, $feature; |
|
0
|
|
|
|
|
|
|
1256
|
0
|
|
|
|
|
|
return 1; |
1257
|
|
|
|
|
|
|
} |
1258
|
|
|
|
|
|
|
|
1259
|
|
|
|
|
|
|
|
1260
|
|
|
|
|
|
|
sub check_orphanage { |
1261
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
1262
|
0
|
0
|
|
|
|
|
return unless scalar @{ $self->{orphans} }; |
|
0
|
|
|
|
|
|
|
1263
|
|
|
|
|
|
|
|
1264
|
|
|
|
|
|
|
# go through the list of orphans |
1265
|
0
|
|
|
|
|
|
my @reunited; # list of indices to delete after reuniting orphan with parent |
1266
|
0
|
|
|
|
|
|
for (my $i = 0; $i < scalar @{ $self->{orphans} }; $i++) { |
|
0
|
|
|
|
|
|
|
1267
|
0
|
|
|
|
|
|
my $orphan = $self->{orphans}->[$i]; |
1268
|
0
|
|
|
|
|
|
my $success = 0; |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
# find the parent |
1271
|
0
|
|
|
|
|
|
foreach my $parent ($orphan->get_tag_values('Parent') ) { |
1272
|
0
|
0
|
|
|
|
|
if (exists $self->{loaded}{$parent}) { |
1273
|
|
|
|
|
|
|
# we have loaded the parent |
1274
|
|
|
|
|
|
|
# associate each orphan feature with the parent |
1275
|
0
|
|
|
|
|
|
$self->{loaded}{$parent}->add_SeqFeature($orphan); |
1276
|
0
|
|
|
|
|
|
$success++; |
1277
|
|
|
|
|
|
|
} |
1278
|
|
|
|
|
|
|
} |
1279
|
|
|
|
|
|
|
# delete the orphan from the array if it found it's long lost parent |
1280
|
0
|
0
|
|
|
|
|
push @reunited, $i if $success; |
1281
|
|
|
|
|
|
|
} |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
# clean up the orphanage |
1284
|
0
|
|
|
|
|
|
while (@reunited) { |
1285
|
0
|
|
|
|
|
|
my $i = pop @reunited; |
1286
|
0
|
|
|
|
|
|
splice(@{ $self->{orphans} }, $i, 1); |
|
0
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
} |
1288
|
|
|
|
|
|
|
} |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
sub orphans { |
1291
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
1292
|
0
|
|
|
|
|
|
my @orphans; |
1293
|
0
|
|
|
|
|
|
foreach (@{ $self->{orphans} }) { |
|
0
|
|
|
|
|
|
|
1294
|
0
|
|
|
|
|
|
push @orphans, $_; |
1295
|
|
|
|
|
|
|
} |
1296
|
0
|
0
|
|
|
|
|
return wantarray ? @orphans : \@orphans; |
1297
|
|
|
|
|
|
|
} |
1298
|
|
|
|
|
|
|
|
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
sub comments { |
1301
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
1302
|
0
|
|
|
|
|
|
my @comments; |
1303
|
0
|
|
|
|
|
|
foreach (@{ $self->{comments} }) { |
|
0
|
|
|
|
|
|
|
1304
|
0
|
|
|
|
|
|
push @comments, $_; |
1305
|
|
|
|
|
|
|
} |
1306
|
0
|
0
|
|
|
|
|
return wantarray ? @comments : \@comments; |
1307
|
|
|
|
|
|
|
} |
1308
|
|
|
|
|
|
|
|
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
sub seq_ids { |
1311
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
1312
|
0
|
0
|
|
|
|
|
unless (scalar keys %{$self->{seq_ids}}) { |
|
0
|
|
|
|
|
|
|
1313
|
0
|
|
|
|
|
|
$self->_get_seq_ids; |
1314
|
|
|
|
|
|
|
} |
1315
|
0
|
|
|
|
|
|
my @s = keys %{$self->{seq_ids}}; |
|
0
|
|
|
|
|
|
|
1316
|
0
|
0
|
|
|
|
|
return wantarray ? @s : \@s; |
1317
|
|
|
|
|
|
|
} |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
|
1320
|
|
|
|
|
|
|
sub seq_id_lengths { |
1321
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
1322
|
0
|
0
|
|
|
|
|
unless (scalar keys %{$self->{seq_ids}}) { |
|
0
|
|
|
|
|
|
|
1323
|
0
|
|
|
|
|
|
$self->_get_seq_ids; |
1324
|
|
|
|
|
|
|
} |
1325
|
0
|
|
|
|
|
|
return $self->{seq_ids}; |
1326
|
|
|
|
|
|
|
} |
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
sub _get_seq_ids { |
1329
|
0
|
|
|
0
|
|
|
my $self = shift; |
1330
|
0
|
0
|
|
|
|
|
return unless $self->{'eof'}; |
1331
|
0
|
|
|
|
|
|
foreach (@{ $self->{top_features} }) { |
|
0
|
|
|
|
|
|
|
1332
|
0
|
|
|
|
|
|
my $s = $_->seq_id; |
1333
|
0
|
0
|
|
|
|
|
unless (exists $self->{seq_ids}{$s}) { |
1334
|
0
|
|
|
|
|
|
$self->{seq_ids}{$s} = 1; |
1335
|
|
|
|
|
|
|
} |
1336
|
0
|
0
|
|
|
|
|
$self->{seq_ids}{$s} = $_->end if $_->end > $self->{seq_ids}{$s}; |
1337
|
|
|
|
|
|
|
} |
1338
|
|
|
|
|
|
|
} |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
__END__ |