line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
=head1 NAME |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
Bio::DB::GFF -- Storage and retrieval of sequence annotation data |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 SYNOPSIS |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use Bio::DB::GFF; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# Open the sequence database |
11
|
|
|
|
|
|
|
my $db = Bio::DB::GFF->new( -adaptor => 'dbi::mysqlopt', |
12
|
|
|
|
|
|
|
-dsn => 'dbi:mysql:elegans'); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
# fetch a 1 megabase segment of sequence starting at landmark "ZK909" |
15
|
|
|
|
|
|
|
my $segment = $db->segment('ZK909', 1 => 1000000); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# pull out all transcript features |
18
|
|
|
|
|
|
|
my @transcripts = $segment->features('transcript'); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# for each transcript, total the length of the introns |
21
|
|
|
|
|
|
|
my %totals; |
22
|
|
|
|
|
|
|
for my $t (@transcripts) { |
23
|
|
|
|
|
|
|
my @introns = $t->Intron; |
24
|
|
|
|
|
|
|
$totals{$t->name} += $_->length foreach @introns; |
25
|
|
|
|
|
|
|
} |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Sort the exons of the first transcript by position |
28
|
|
|
|
|
|
|
my @exons = sort {$a->start <=> $b->start} $transcripts[0]->Exon; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
# Get a region 1000 bp upstream of first exon |
31
|
|
|
|
|
|
|
my $upstream = $exons[0]->subseq(-1000,0); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# get its DNA |
34
|
|
|
|
|
|
|
my $dna = $upstream->seq; |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# and get all curated polymorphisms inside it |
37
|
|
|
|
|
|
|
@polymorphisms = $upstream->contained_features('polymorphism:curated'); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# get all feature types in the database |
40
|
|
|
|
|
|
|
my @types = $db->types; |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# count all feature types in the segment |
43
|
|
|
|
|
|
|
my %type_counts = $segment->types(-enumerate=>1); |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
# get an iterator on all curated features of type 'exon' or 'intron' |
46
|
|
|
|
|
|
|
my $iterator = $db->get_seq_stream(-type => ['exon:curated','intron:curated']); |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
while (my $s = $iterator->next_seq) { |
49
|
|
|
|
|
|
|
print $s,"\n"; |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# find all transcripts annotated as having function 'kinase' |
53
|
|
|
|
|
|
|
my $iterator = $db->get_seq_stream(-type=>'transcript', |
54
|
|
|
|
|
|
|
-attributes=>{Function=>'kinase'}); |
55
|
|
|
|
|
|
|
while (my $s = $iterator->next_seq) { |
56
|
|
|
|
|
|
|
print $s,"\n"; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head1 DESCRIPTION |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Bio::DB::GFF provides fast indexed access to a sequence annotation |
62
|
|
|
|
|
|
|
database. It supports multiple database types (ACeDB, relational), |
63
|
|
|
|
|
|
|
and multiple schemas through a system of adaptors and aggregators. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
The following operations are supported by this module: |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
- retrieving a segment of sequence based on the ID of a landmark |
68
|
|
|
|
|
|
|
- retrieving the DNA from that segment |
69
|
|
|
|
|
|
|
- finding all annotations that overlap with the segment |
70
|
|
|
|
|
|
|
- finding all annotations that are completely contained within the |
71
|
|
|
|
|
|
|
segment |
72
|
|
|
|
|
|
|
- retrieving all annotations of a particular type, either within a |
73
|
|
|
|
|
|
|
segment, or globally |
74
|
|
|
|
|
|
|
- conversion from absolute to relative coordinates and back again, |
75
|
|
|
|
|
|
|
using any arbitrary landmark for the relative coordinates |
76
|
|
|
|
|
|
|
- using a sequence segment to create new segments based on relative |
77
|
|
|
|
|
|
|
offsets |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
The data model used by Bio::DB::GFF is compatible with the GFF flat |
80
|
|
|
|
|
|
|
file format (L). The module |
81
|
|
|
|
|
|
|
can load a set of GFF files into the database, and serves objects that |
82
|
|
|
|
|
|
|
have methods corresponding to GFF fields. |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
The objects returned by Bio::DB::GFF are compatible with the |
85
|
|
|
|
|
|
|
SeqFeatureI interface, allowing their use by the Bio::Graphics and |
86
|
|
|
|
|
|
|
Bio::DAS modules. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
=head2 Auxiliary Scripts |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
The bioperl distribution includes several scripts that make it easier |
91
|
|
|
|
|
|
|
to work with Bio::DB::GFF databases. They are located in the scripts |
92
|
|
|
|
|
|
|
directory under a subdirectory named Bio::DB::GFF: |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=over 4 |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=item * |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
bp_load_gff.pl |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
This script will load a Bio::DB::GFF database from a flat GFF file of |
101
|
|
|
|
|
|
|
sequence annotations. Only the relational database version of |
102
|
|
|
|
|
|
|
Bio::DB::GFF is supported. It can be used to create the database from |
103
|
|
|
|
|
|
|
scratch, as well as to incrementally load new data. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
This script takes a --fasta argument to load raw DNA into the database |
106
|
|
|
|
|
|
|
as well. However, GFF databases do not require access to the raw DNA |
107
|
|
|
|
|
|
|
for most of their functionality. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
load_gff.pl also has a --upgrade option, which will perform a |
110
|
|
|
|
|
|
|
non-destructive upgrade of older schemas to newer ones. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=item * |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
bp_bulk_load_gff.pl |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
This script will populate a Bio::DB::GFF database from a flat GFF file |
117
|
|
|
|
|
|
|
of sequence annotations. Only the MySQL database version of |
118
|
|
|
|
|
|
|
Bio::DB::GFF is supported. It uses the "LOAD DATA INFILE" query in |
119
|
|
|
|
|
|
|
order to accelerate loading considerably; however, it can only be used |
120
|
|
|
|
|
|
|
for the initial load, and not for updates. |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
This script takes a --fasta argument to load raw DNA into the database |
123
|
|
|
|
|
|
|
as well. However, GFF databases do not require access to the raw DNA |
124
|
|
|
|
|
|
|
for most of their functionality. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item * |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
bp_fast_load_gff.pl |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
This script is as fast as bp_bulk_load_gff.pl but uses Unix pipe |
131
|
|
|
|
|
|
|
tricks to allow for incremental updates. It only supports the MySQL |
132
|
|
|
|
|
|
|
database version of Bio::DB::GFF and is guaranteed not to work on |
133
|
|
|
|
|
|
|
non-Unix platforms. |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
Arguments are the same as bp_load_gff.pl |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
=item * |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
gadfly_to_gff.pl |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
This script will convert the GFF-like format used by the Berkeley |
142
|
|
|
|
|
|
|
Drosophila Sequencing project into a format suitable for use with this |
143
|
|
|
|
|
|
|
module. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=item * |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sgd_to_gff.pl |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
This script will convert the tab-delimited feature files used by the |
150
|
|
|
|
|
|
|
Saccharomyces Genome Database into a format suitable for use with this |
151
|
|
|
|
|
|
|
module. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=back |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head2 GFF Fundamentals |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
The GFF format is a flat tab-delimited file, each line of which |
158
|
|
|
|
|
|
|
corresponds to an annotation, or feature. Each line has nine columns |
159
|
|
|
|
|
|
|
and looks like this: |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
Chr1 curated CDS 365647 365963 . + 1 Transcript "R119.7" |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
The 9 columns are as follows: |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=over 4 |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=item 1. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
reference sequence |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
This is the ID of the sequence that is used to establish the |
172
|
|
|
|
|
|
|
coordinate system of the annotation. In the example above, the |
173
|
|
|
|
|
|
|
reference sequence is "Chr1". |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=item 2. |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
source |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
The source of the annotation. This field describes how the annotation |
180
|
|
|
|
|
|
|
was derived. In the example above, the source is "curated" to |
181
|
|
|
|
|
|
|
indicate that the feature is the result of human curation. The names |
182
|
|
|
|
|
|
|
and versions of software programs are often used for the source field, |
183
|
|
|
|
|
|
|
as in "tRNAScan-SE/1.2". |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=item 3. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
method |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
The annotation method. This field describes the type of the |
190
|
|
|
|
|
|
|
annotation, such as "CDS". Together the method and source describe |
191
|
|
|
|
|
|
|
the annotation type. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=item 4. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
start position |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
The start of the annotation relative to the reference sequence. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=item 5. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
stop position |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
The stop of the annotation relative to the reference sequence. Start |
204
|
|
|
|
|
|
|
is always less than or equal to stop. |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=item 6. |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
score |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
For annotations that are associated with a numeric score (for example, |
211
|
|
|
|
|
|
|
a sequence similarity), this field describes the score. The score |
212
|
|
|
|
|
|
|
units are completely unspecified, but for sequence similarities, it is |
213
|
|
|
|
|
|
|
typically percent identity. Annotations that don't have a score can |
214
|
|
|
|
|
|
|
use "." |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item 7. |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
strand |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
For those annotations which are strand-specific, this field is the |
221
|
|
|
|
|
|
|
strand on which the annotation resides. It is "+" for the forward |
222
|
|
|
|
|
|
|
strand, "-" for the reverse strand, or "." for annotations that are |
223
|
|
|
|
|
|
|
not stranded. |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=item 8. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
phase |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
For annotations that are linked to proteins, this field describes the |
230
|
|
|
|
|
|
|
phase of the annotation on the codons. It is a number from 0 to 2, or |
231
|
|
|
|
|
|
|
"." for features that have no phase. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=item 9. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
group |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
GFF provides a simple way of generating annotation hierarchies ("is |
238
|
|
|
|
|
|
|
composed of" relationships) by providing a group field. The group |
239
|
|
|
|
|
|
|
field contains the class and ID of an annotation which is the logical |
240
|
|
|
|
|
|
|
parent of the current one. In the example given above, the group is |
241
|
|
|
|
|
|
|
the Transcript named "R119.7". |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
The group field is also used to store information about the target of |
244
|
|
|
|
|
|
|
sequence similarity hits, and miscellaneous notes. See the next |
245
|
|
|
|
|
|
|
section for a description of how to describe similarity targets. |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
The format of the group fields is "Class ID" with a single space (not |
248
|
|
|
|
|
|
|
a tab) separating the class from the ID. It is VERY IMPORTANT to |
249
|
|
|
|
|
|
|
follow this format, or grouping will not work properly. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=back |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
The sequences used to establish the coordinate system for annotations |
254
|
|
|
|
|
|
|
can correspond to sequenced clones, clone fragments, contigs or |
255
|
|
|
|
|
|
|
super-contigs. Thus, this module can be used throughout the lifecycle |
256
|
|
|
|
|
|
|
of a sequencing project. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
In addition to a group ID, the GFF format allows annotations to have a |
259
|
|
|
|
|
|
|
group class. For example, in the ACeDB representation, RNA |
260
|
|
|
|
|
|
|
interference experiments have a class of "RNAi" and an ID that is |
261
|
|
|
|
|
|
|
unique among the RNAi experiments. Since not all databases support |
262
|
|
|
|
|
|
|
this notion, the class is optional in all calls to this module, and |
263
|
|
|
|
|
|
|
defaults to "Sequence" when not provided. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
Double-quotes are sometimes used in GFF files around components of the |
266
|
|
|
|
|
|
|
group field. Strictly, this is only necessary if the group name or |
267
|
|
|
|
|
|
|
class contains whitespace. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head2 Making GFF files work with this module |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
Some annotations do not need to be individually named. For example, |
272
|
|
|
|
|
|
|
it is probably not useful to assign a unique name to each ALU repeat |
273
|
|
|
|
|
|
|
in a vertebrate genome. Others, such as predicted genes, correspond |
274
|
|
|
|
|
|
|
to named biological objects; you probably want to be able to fetch the |
275
|
|
|
|
|
|
|
positions of these objects by referring to them by name. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
To accommodate named annotations, the GFF format places the object |
278
|
|
|
|
|
|
|
class and name in the group field. The name identifies the object, |
279
|
|
|
|
|
|
|
and the class prevents similarly-named objects, for example clones and |
280
|
|
|
|
|
|
|
sequences, from collding. |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
A named object is shown in the following excerpt from a GFF file: |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
Chr1 curated transcript 939627 942410 . + . Transcript Y95B8A.2 |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
This object is a predicted transcript named Y95BA.2. In this case, |
287
|
|
|
|
|
|
|
the group field is used to identify the class and name of the object, |
288
|
|
|
|
|
|
|
even though no other annotation belongs to that group. |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
It now becomes possible to retrieve the region of the genome covered |
291
|
|
|
|
|
|
|
by transcript Y95B8A.2 using the segment() method: |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
$segment = $db->segment(-class=>'Transcript',-name=>'Y95B8A.2'); |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
It is not necessary for the annotation's method to correspond to the |
296
|
|
|
|
|
|
|
object class, although this is commonly the case. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
As explained above, each annotation in a GFF file refers to a |
299
|
|
|
|
|
|
|
reference sequence. It is important that each reference sequence also |
300
|
|
|
|
|
|
|
be identified by a line in the GFF file. This allows the Bio::DB::GFF |
301
|
|
|
|
|
|
|
module to determine the length and class of the reference sequence, |
302
|
|
|
|
|
|
|
and makes it possible to do relative arithmetic. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
For example, if "Chr1" is used as a reference sequence, then it should |
305
|
|
|
|
|
|
|
have an entry in the GFF file similar to this one: |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
Chr1 assembly chromosome 1 14972282 . + . Sequence Chr1 |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
This indicates that the reference sequence named "Chr1" has length |
310
|
|
|
|
|
|
|
14972282 bp, method "chromosome" and source "assembly". In addition, |
311
|
|
|
|
|
|
|
as indicated by the group field, Chr1 has class "Sequence" and name |
312
|
|
|
|
|
|
|
"Chr1". |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
The object class "Sequence" is used by default when the class is not |
315
|
|
|
|
|
|
|
specified in the segment() call. This allows you to use a shortcut |
316
|
|
|
|
|
|
|
form of the segment() method: |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
$segment = $db->segment('Chr1'); # whole chromosome |
319
|
|
|
|
|
|
|
$segment = $db->segment('Chr1',1=>1000); # first 1000 bp |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
For your convenience, if, during loading a GFF file, Bio::DB::GFF |
322
|
|
|
|
|
|
|
encounters a line like the following: |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
##sequence-region Chr1 1 14972282 |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
It will automatically generate the following entry: |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Chr1 reference Component 1 14972282 . + . Sequence Chr1 |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
This is sufficient to use Chr1 as a reference point. |
331
|
|
|
|
|
|
|
The ##sequence-region line is frequently found in the GFF files |
332
|
|
|
|
|
|
|
distributed by annotation groups. |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
=head2 Specifying the group tag |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
A frequent problem with GFF files is the problem distinguishing |
337
|
|
|
|
|
|
|
which of the several tag/value pairs in the 9th column is the grouping |
338
|
|
|
|
|
|
|
pair. Ordinarily the first tag will be used for grouping, but some |
339
|
|
|
|
|
|
|
GFF manipulating tools do not preserve the order of attributes. To |
340
|
|
|
|
|
|
|
eliminate this ambiguity, this module provides two ways of explicitly |
341
|
|
|
|
|
|
|
specifying which tag to group on: |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=over 4 |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=item * |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Using -preferred_groups |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
When you create a Bio::DB::GFF object, pass it a -preferred_groups=E |
350
|
|
|
|
|
|
|
argument. This specifies a tag that will be used for grouping. You |
351
|
|
|
|
|
|
|
can pass an array reference to specify a list of such tags. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
=item * |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
In the GFF header |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
The GFF file itself can specify which tags are to be used for |
358
|
|
|
|
|
|
|
grouping. Insert a comment like the following: |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
##group-tags Accession Locus |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
This says to use the Accession tag for grouping. If it is not |
363
|
|
|
|
|
|
|
available, use the Locus tag. If neither tag is available, use the |
364
|
|
|
|
|
|
|
first pair to appear. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
=back |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
These options only apply when B a GFF file into the database, |
369
|
|
|
|
|
|
|
and have no effect on existing databases. |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
The group-tags comment in the GFF file will *override* the preferred |
372
|
|
|
|
|
|
|
groups set when you create the Bio::DB::GFF object. |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
For backward compatibility, the tags Sequence and Transcript are |
375
|
|
|
|
|
|
|
always treated as grouping tags unless preferred_tags are specified. |
376
|
|
|
|
|
|
|
The "Target" tag is always used for grouping regardless of the |
377
|
|
|
|
|
|
|
preferred_groups() setting, and the tags "tstart", "tend" and "Note" |
378
|
|
|
|
|
|
|
cannot be used for grouping. These are historical artefacts coming |
379
|
|
|
|
|
|
|
from various interpretations of GFF2, and cannot be changed. |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=head2 Sequence alignments |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
There are two cases in which an annotation indicates the relationship |
384
|
|
|
|
|
|
|
between two sequences. The first case is a similarity hit, where the |
385
|
|
|
|
|
|
|
annotation indicates an alignment. The second case is a map assembly, |
386
|
|
|
|
|
|
|
in which the annotation indicates that a portion of a larger sequence |
387
|
|
|
|
|
|
|
is built up from one or more smaller ones. |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
Both cases are indicated by using the B tag in the group |
390
|
|
|
|
|
|
|
field. For example, a typical similarity hit will look like this: |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
Chr1 BLASTX similarity 76953 77108 132 + 0 Target Protein:SW:ABL_DROME 493 544 |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
The group field contains the Target tag, followed by an identifier for |
395
|
|
|
|
|
|
|
the biological object referred to. The GFF format uses the notation |
396
|
|
|
|
|
|
|
I:I for the biological object, and even though this is |
397
|
|
|
|
|
|
|
stylistically inconsistent, that's the way it's done. The object |
398
|
|
|
|
|
|
|
identifier is followed by two integers indicating the start and stop |
399
|
|
|
|
|
|
|
of the alignment on the target sequence. |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
Unlike the main start and stop columns, it is possible for the target |
402
|
|
|
|
|
|
|
start to be greater than the target end. The previous example |
403
|
|
|
|
|
|
|
indicates that the the section of Chr1 from 76,953 to 77,108 aligns to |
404
|
|
|
|
|
|
|
the protein SW:ABL_DROME starting at position 493 and extending to |
405
|
|
|
|
|
|
|
position 544. |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
A similar notation is used for sequence assembly information as shown |
408
|
|
|
|
|
|
|
in this example: |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
Chr1 assembly Link 10922906 11177731 . . . Target Sequence:LINK_H06O01 1 254826 |
411
|
|
|
|
|
|
|
LINK_H06O01 assembly Cosmid 32386 64122 . . . Target Sequence:F49B2 6 31742 |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
This indicates that the region between bases 10922906 and 11177731 of |
414
|
|
|
|
|
|
|
Chr1 are composed of LINK_H06O01 from bp 1 to bp 254826. The region |
415
|
|
|
|
|
|
|
of LINK_H0601 between 32386 and 64122 is, in turn, composed of the |
416
|
|
|
|
|
|
|
bases 5 to 31742 of cosmid F49B2. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=head2 Attributes |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
While not intended to serve as a general-purpose sequence database |
421
|
|
|
|
|
|
|
(see bioperl-db for that), GFF allows you to tag features with |
422
|
|
|
|
|
|
|
arbitrary attributes. Attributes appear in the Group field following |
423
|
|
|
|
|
|
|
the initial class/name pair. For example: |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Chr1 cur trans 939 942 . + . Transcript Y95B8A.2 ; Gene sma-3 ; Alias sma3 |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
This line tags the feature named Transcript Y95B8A.2 as being "Gene" |
428
|
|
|
|
|
|
|
named sma-3 and having the Alias "sma3". Features having these |
429
|
|
|
|
|
|
|
attributes can be looked up using the fetch_feature_by_attribute() method. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
Two attributes have special meaning: "Note" is for backward |
432
|
|
|
|
|
|
|
compatibility and is used for unstructured text remarks. "Alias" is |
433
|
|
|
|
|
|
|
considered as a synonym for the feature name and will be consulted |
434
|
|
|
|
|
|
|
when looking up a feature by its name. |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=head2 Adaptors and Aggregators |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
This module uses a system of adaptors and aggregators in order to make |
439
|
|
|
|
|
|
|
it adaptable to use with a variety of databases. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=over 4 |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=item * |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
Adaptors |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
The core of the module handles the user API, annotation coordinate |
448
|
|
|
|
|
|
|
arithmetic, and other common issues. The details of fetching |
449
|
|
|
|
|
|
|
information from databases is handled by an adaptor, which is |
450
|
|
|
|
|
|
|
specified during Bio::DB::GFF construction. The adaptor encapsulates |
451
|
|
|
|
|
|
|
database-specific information such as the schema, user authentication |
452
|
|
|
|
|
|
|
and access methods. |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
There are currently five adaptors recommended for general use: |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
Adaptor Name Description |
457
|
|
|
|
|
|
|
------------ ----------- |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
memory A simple in-memory database suitable for testing |
460
|
|
|
|
|
|
|
and small data sets. |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
berkeleydb An indexed file database based on the DB_File module, |
463
|
|
|
|
|
|
|
suitable for medium-sized read-only data sets. |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
dbi::mysql An interface to a schema implemented in the Mysql |
466
|
|
|
|
|
|
|
relational database management system. |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
dbi::oracle An interface to a schema implemented in the Oracle |
469
|
|
|
|
|
|
|
relational database management system. |
470
|
|
|
|
|
|
|
|
471
|
|
|
|
|
|
|
dbi::pg An interface to a schema implemented in the PostgreSQL |
472
|
|
|
|
|
|
|
relational database management system. |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
Check the Bio/DB/GFF/Adaptor directory and subdirectories for other, |
475
|
|
|
|
|
|
|
more specialized adaptors, as well as experimental ones. |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
=item * |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
Aggregators |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
The GFF format uses a "group" field to indicate aggregation properties |
482
|
|
|
|
|
|
|
of individual features. For example, a set of exons and introns may |
483
|
|
|
|
|
|
|
share a common transcript group, and multiple transcripts may share |
484
|
|
|
|
|
|
|
the same gene group. |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
Aggregators are small modules that use the group information to |
487
|
|
|
|
|
|
|
rebuild the hierarchy. When a Bio::DB::GFF object is created, you |
488
|
|
|
|
|
|
|
indicate that it use a set of one or more aggregators. Each |
489
|
|
|
|
|
|
|
aggregator provides a new composite annotation type. Before the |
490
|
|
|
|
|
|
|
database query is generated each aggregator is called to |
491
|
|
|
|
|
|
|
"disaggregate" its annotation type into list of component types |
492
|
|
|
|
|
|
|
contained in the database. After the query is generated, each |
493
|
|
|
|
|
|
|
aggregator is called again in order to build composite annotations |
494
|
|
|
|
|
|
|
from the returned components. |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
For example, during disaggregation, the standard |
497
|
|
|
|
|
|
|
"processed_transcript" aggregator generates a list of component |
498
|
|
|
|
|
|
|
feature types including "UTR", "CDS", and "polyA_site". Later, it |
499
|
|
|
|
|
|
|
aggregates these features into a set of annotations of type |
500
|
|
|
|
|
|
|
"processed_transcript". |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
During aggregation, the list of aggregators is called in reverse |
503
|
|
|
|
|
|
|
order. This allows aggregators to collaborate to create multi-level |
504
|
|
|
|
|
|
|
structures: the transcript aggregator assembles transcripts from |
505
|
|
|
|
|
|
|
introns and exons; the gene aggregator then assembles genes from sets |
506
|
|
|
|
|
|
|
of transcripts. |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
Three default aggregators are provided: |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
transcript assembles transcripts from features of type |
511
|
|
|
|
|
|
|
exon, CDS, 5'UTR, 3'UTR, TSS, and PolyA |
512
|
|
|
|
|
|
|
clone assembles clones from Clone_left_end, Clone_right_end |
513
|
|
|
|
|
|
|
and Sequence features. |
514
|
|
|
|
|
|
|
alignment assembles gapped alignments from features of type |
515
|
|
|
|
|
|
|
"similarity". |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
In addition, this module provides the optional "wormbase_gene" |
518
|
|
|
|
|
|
|
aggregator, which accommodates the WormBase representation of genes. |
519
|
|
|
|
|
|
|
This aggregator aggregates features of method "exon", "CDS", "5'UTR", |
520
|
|
|
|
|
|
|
"3'UTR", "polyA" and "TSS" into a single object. It also expects to |
521
|
|
|
|
|
|
|
find a single feature of type "Sequence" that spans the entire gene. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
The existing aggregators are easily customized. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
Note that aggregation will not occur unless you specifically request |
526
|
|
|
|
|
|
|
the aggregation type. For example, this call: |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
@features = $segment->features('alignment'); |
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
will generate an array of aggregated alignment features. However, |
531
|
|
|
|
|
|
|
this call: |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
@features = $segment->features(); |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
will return a list of unaggregated similarity segments. |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
For more informnation, see the manual pages for |
538
|
|
|
|
|
|
|
Bio::DB::GFF::Aggregator::processed_transcript, Bio::DB::GFF::Aggregator::clone, |
539
|
|
|
|
|
|
|
etc. |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
=back |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
=head2 Loading GFF3 Files |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
This module will accept GFF3 files, as described at |
546
|
|
|
|
|
|
|
http://song.sourceforge.net/gff3.shtml. However, the implementation |
547
|
|
|
|
|
|
|
has some limitations. |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=over 4 |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=item GFF version string is required |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
The GFF file B contain the version comment: |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
##gff-version 3 |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
Unless this version string is present at the top of the GFF file, the |
558
|
|
|
|
|
|
|
loader will attempt to parse the file in GFF2 format, with |
559
|
|
|
|
|
|
|
less-than-desirable results. |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=item Only one level of nesting allowed |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
A major restriction is that Bio::DB::GFF only allows one level of |
564
|
|
|
|
|
|
|
nesting of features. For nesting, the Target tag will be used |
565
|
|
|
|
|
|
|
preferentially followed by the ID tag, followed by the Parent tag. |
566
|
|
|
|
|
|
|
This means that if genes are represented like this: |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
XXXX XXXX gene XXXX XXXX XXXX ID=myGene |
569
|
|
|
|
|
|
|
XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene |
570
|
|
|
|
|
|
|
XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript |
571
|
|
|
|
|
|
|
XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
Then there will be one group called myGene containing the "gene" |
574
|
|
|
|
|
|
|
feature and one group called myTranscript containing the mRNA, and two |
575
|
|
|
|
|
|
|
exons. |
576
|
|
|
|
|
|
|
|
577
|
|
|
|
|
|
|
You can work around this restriction to some extent by using the Alias |
578
|
|
|
|
|
|
|
attribute literally: |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
XXXX XXXX gene XXXX XXXX XXXX ID=myGene |
581
|
|
|
|
|
|
|
XXXX XXXX mRNA XXXX XXXX XXXX ID=myTranscript;Parent=myGene;Alias=myGene |
582
|
|
|
|
|
|
|
XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene |
583
|
|
|
|
|
|
|
XXXX XXXX exon XXXX XXXX XXXX Parent=myTranscript;Alias=myGene |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
This limitation will be corrected in the next version of Bio::DB::GFF. |
586
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
=back |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=head1 API |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
The following is the API for Bio::DB::GFF. |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
=cut |
594
|
|
|
|
|
|
|
|
595
|
|
|
|
|
|
|
package Bio::DB::GFF; |
596
|
|
|
|
|
|
|
|
597
|
3
|
|
|
3
|
|
2244
|
use strict; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
63
|
|
598
|
|
|
|
|
|
|
|
599
|
3
|
|
|
3
|
|
1110
|
use IO::File; |
|
3
|
|
|
|
|
1911
|
|
|
3
|
|
|
|
|
261
|
|
600
|
3
|
|
|
3
|
|
12
|
use File::Glob ':glob'; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
456
|
|
601
|
3
|
|
|
3
|
|
849
|
use Bio::DB::GFF::Util::Rearrange; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
132
|
|
602
|
3
|
|
|
3
|
|
1038
|
use Bio::DB::GFF::RelSegment; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
93
|
|
603
|
3
|
|
|
3
|
|
15
|
use Bio::DB::GFF::Feature; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
45
|
|
604
|
3
|
|
|
3
|
|
1125
|
use Bio::DB::GFF::Aggregator; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
69
|
|
605
|
|
|
|
|
|
|
|
606
|
3
|
|
|
3
|
|
12
|
use base qw(Bio::Root::Root Bio::DasI); |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
3480
|
|
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
my %valid_range_types = (overlaps => 1, |
609
|
|
|
|
|
|
|
contains => 1, |
610
|
|
|
|
|
|
|
contained_in => 1); |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=head1 Querying GFF Databases |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
=head2 new |
615
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
Title : new |
617
|
|
|
|
|
|
|
Usage : my $db = Bio::DB::GFF->new(@args); |
618
|
|
|
|
|
|
|
Function: create a new Bio::DB::GFF object |
619
|
|
|
|
|
|
|
Returns : new Bio::DB::GFF object |
620
|
|
|
|
|
|
|
Args : lists of adaptors and aggregators |
621
|
|
|
|
|
|
|
Status : Public |
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
These are the arguments: |
624
|
|
|
|
|
|
|
|
625
|
|
|
|
|
|
|
-adaptor Name of the adaptor module to use. If none |
626
|
|
|
|
|
|
|
provided, defaults to "dbi::mysqlopt". |
627
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
-aggregator Array reference to a list of aggregators |
629
|
|
|
|
|
|
|
to apply to the database. If none provided, |
630
|
|
|
|
|
|
|
defaults to ['processed_transcript','alignment']. |
631
|
|
|
|
|
|
|
|
632
|
|
|
|
|
|
|
-preferred_groups When interpreteting the 9th column of a GFF2 file, |
633
|
|
|
|
|
|
|
the indicated group names will have preference over |
634
|
|
|
|
|
|
|
other attributes, even if they do not come first in |
635
|
|
|
|
|
|
|
the list of attributes. This can be a scalar value |
636
|
|
|
|
|
|
|
or an array reference. |
637
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
Any other named argument pairs are passed to |
639
|
|
|
|
|
|
|
the adaptor for processing. |
640
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
The adaptor argument must correspond to a module contained within the |
642
|
|
|
|
|
|
|
Bio::DB::GFF::Adaptor namespace. For example, the |
643
|
|
|
|
|
|
|
Bio::DB::GFF::Adaptor::dbi::mysql adaptor is loaded by specifying |
644
|
|
|
|
|
|
|
'dbi::mysql'. By Perl convention, the adaptors names are lower case |
645
|
|
|
|
|
|
|
because they are loaded at run time. |
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
The aggregator array may contain a list of aggregator names, a list of |
648
|
|
|
|
|
|
|
initialized aggregator objects, or a string in the form |
649
|
|
|
|
|
|
|
"aggregator_name{subpart1,subpart2,subpart3/main_method}" (the |
650
|
|
|
|
|
|
|
"/main_method" part is optional, but if present a feature with the |
651
|
|
|
|
|
|
|
main_method must be present in order for aggregation to occur). For |
652
|
|
|
|
|
|
|
example, if you wish to change the components aggregated by the |
653
|
|
|
|
|
|
|
transcript aggregator, you could pass it to the GFF constructor this |
654
|
|
|
|
|
|
|
way: |
655
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
my $transcript = |
657
|
|
|
|
|
|
|
Bio::DB::Aggregator::transcript->new(-sub_parts=>[qw(exon intron utr |
658
|
|
|
|
|
|
|
polyA spliced_leader)]); |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
my $db = Bio::DB::GFF->new(-aggregator=>[$transcript,'clone','alignment], |
661
|
|
|
|
|
|
|
-adaptor => 'dbi::mysql', |
662
|
|
|
|
|
|
|
-dsn => 'dbi:mysql:elegans42'); |
663
|
|
|
|
|
|
|
|
664
|
|
|
|
|
|
|
Alternatively, you could create an entirely new transcript aggregator |
665
|
|
|
|
|
|
|
this way: |
666
|
|
|
|
|
|
|
|
667
|
|
|
|
|
|
|
my $new_agg = 'transcript{exon,intron,utr,polyA,spliced_leader}'; |
668
|
|
|
|
|
|
|
my $db = Bio::DB::GFF->new(-aggregator=>[$new_agg,'clone','alignment], |
669
|
|
|
|
|
|
|
-adaptor => 'dbi::mysql', |
670
|
|
|
|
|
|
|
-dsn => 'dbi:mysql:elegans42'); |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
See L for more details. |
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
The B<-preferred_groups> argument is used to change the default |
675
|
|
|
|
|
|
|
processing of the 9th column of GFF version 2 files. By default, the |
676
|
|
|
|
|
|
|
first tag/value pair is used to establish the group class and name. |
677
|
|
|
|
|
|
|
If you pass -preferred_groups a scalar, the parser will look for a tag |
678
|
|
|
|
|
|
|
of the indicated type and use it as the group even if it is not first |
679
|
|
|
|
|
|
|
in the file. If you pass this argument a list of group classes as an |
680
|
|
|
|
|
|
|
array ref, then the list will establish the precedence for searching. |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
The commonly used 'dbi::mysql' adaptor recognizes the following |
683
|
|
|
|
|
|
|
adaptor-specific arguments: |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
Argument Description |
686
|
|
|
|
|
|
|
-------- ----------- |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
-dsn the DBI data source, e.g. 'dbi:mysql:ens0040' |
689
|
|
|
|
|
|
|
If a partial name is given, such as "ens0040", the |
690
|
|
|
|
|
|
|
"dbi:mysql:" prefix will be added automatically. |
691
|
|
|
|
|
|
|
|
692
|
|
|
|
|
|
|
-user username for authentication |
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
-pass the password for authentication |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
-refclass landmark Class; defaults to "Sequence" |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
|
699
|
|
|
|
|
|
|
The commonly used 'dbi::mysqlopt' adaptor also recogizes the following |
700
|
|
|
|
|
|
|
arguments. |
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
Argument Description |
703
|
|
|
|
|
|
|
-------- ----------- |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
-fasta path to a directory containing FASTA files for the DNA |
706
|
|
|
|
|
|
|
contained in this database (e.g. "/usr/local/share/fasta") |
707
|
|
|
|
|
|
|
|
708
|
|
|
|
|
|
|
-acedb an acedb URL to use when converting features into ACEDB |
709
|
|
|
|
|
|
|
objects (e.g. sace://localhost:2005) |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
=cut |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
#' |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
sub new { |
716
|
5
|
|
|
5
|
1
|
73912
|
my $package = shift; |
717
|
5
|
|
|
|
|
13
|
my ($adaptor,$aggregators,$args,$refclass,$preferred_groups); |
718
|
|
|
|
|
|
|
|
719
|
5
|
50
|
|
|
|
17
|
if (@_ == 1) { # special case, default to dbi::mysqlopt |
720
|
0
|
|
|
|
|
0
|
$adaptor = 'dbi::mysqlopt'; |
721
|
0
|
|
|
|
|
0
|
$args = {DSN => shift}; |
722
|
|
|
|
|
|
|
} else { |
723
|
5
|
|
|
|
|
54
|
($adaptor,$aggregators,$refclass,$preferred_groups,$args) = rearrange([ |
724
|
|
|
|
|
|
|
[qw(ADAPTOR FACTORY)], |
725
|
|
|
|
|
|
|
[qw(AGGREGATOR AGGREGATORS)], |
726
|
|
|
|
|
|
|
'REFCLASS', |
727
|
|
|
|
|
|
|
'PREFERRED_GROUPS' |
728
|
|
|
|
|
|
|
],@_); |
729
|
|
|
|
|
|
|
} |
730
|
|
|
|
|
|
|
|
731
|
5
|
|
50
|
|
|
23
|
$adaptor ||= 'dbi::mysqlopt'; |
732
|
5
|
|
|
|
|
20
|
my $class = "Bio::DB::GFF::Adaptor::\L${adaptor}\E"; |
733
|
5
|
100
|
|
|
|
59
|
unless ($class->can('new')) { |
734
|
3
|
50
|
|
|
|
159
|
eval "require $class;1;" or $package->throw("Unable to load $adaptor adaptor: $@"); |
735
|
|
|
|
|
|
|
} |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
# this hack saves the memory adaptor, which loads the GFF file in new() |
738
|
5
|
50
|
|
|
|
18
|
$args->{PREFERRED_GROUPS} = $preferred_groups if defined $preferred_groups; |
739
|
|
|
|
|
|
|
|
740
|
5
|
|
|
|
|
24
|
my $self = $class->new($args); |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
# handle preferred groups |
743
|
5
|
50
|
|
|
|
16
|
$self->preferred_groups($preferred_groups) if defined $preferred_groups; |
744
|
5
|
|
50
|
|
|
49
|
$self->default_class($refclass || 'Sequence'); |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
# handle the aggregators. |
747
|
|
|
|
|
|
|
# aggregators are responsible for creating complex multi-part features |
748
|
|
|
|
|
|
|
# from the GFF "group" field. If none are provided, then we provide a |
749
|
|
|
|
|
|
|
# list of the two used in WormBase. |
750
|
|
|
|
|
|
|
# Each aggregator can be a scalar or a ref. In the former case |
751
|
|
|
|
|
|
|
# it is treated as a class name to call new() on. In the latter |
752
|
|
|
|
|
|
|
# the aggreator is treated as a ready made object. |
753
|
5
|
50
|
|
|
|
18
|
$aggregators = $self->default_aggregators unless defined $aggregators; |
754
|
5
|
50
|
|
|
|
24
|
my @a = ref($aggregators) eq 'ARRAY' ? @$aggregators : $aggregators; |
755
|
5
|
|
|
|
|
12
|
for my $a (@a) { |
756
|
10
|
|
|
|
|
37
|
$self->add_aggregator($a); |
757
|
|
|
|
|
|
|
} |
758
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# default settings go here..... |
760
|
5
|
|
|
|
|
36
|
$self->automerge(1); # set automerge to true |
761
|
|
|
|
|
|
|
|
762
|
5
|
|
|
|
|
22
|
$self; |
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=head2 types |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
Title : types |
769
|
|
|
|
|
|
|
Usage : $db->types(@args) |
770
|
|
|
|
|
|
|
Function: return list of feature types in range or database |
771
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Typename objects |
772
|
|
|
|
|
|
|
Args : see below |
773
|
|
|
|
|
|
|
Status : public |
774
|
|
|
|
|
|
|
|
775
|
|
|
|
|
|
|
This routine returns a list of feature types known to the database. |
776
|
|
|
|
|
|
|
The list can be database-wide or restricted to a region. It is also |
777
|
|
|
|
|
|
|
possible to find out how many times each feature occurs. |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
For range queries, it is usually more convenient to create a |
780
|
|
|
|
|
|
|
Bio::DB::GFF::Segment object, and then invoke it's types() method. |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
Arguments are as follows: |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
-ref ID of reference sequence |
785
|
|
|
|
|
|
|
-class class of reference sequence |
786
|
|
|
|
|
|
|
-start start of segment |
787
|
|
|
|
|
|
|
-stop stop of segment |
788
|
|
|
|
|
|
|
-enumerate if true, count the features |
789
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
The returned value will be a list of Bio::DB::GFF::Typename objects, |
791
|
|
|
|
|
|
|
which if evaluated in a string context will return the feature type in |
792
|
|
|
|
|
|
|
"method:source" format. This object class also has method() and |
793
|
|
|
|
|
|
|
source() methods for retrieving the like-named fields. |
794
|
|
|
|
|
|
|
|
795
|
|
|
|
|
|
|
If -enumerate is true, then the function returns a hash (not a hash |
796
|
|
|
|
|
|
|
reference) in which the keys are type names in "method:source" format |
797
|
|
|
|
|
|
|
and the values are the number of times each feature appears in the |
798
|
|
|
|
|
|
|
database or segment. |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
The argument -end is a synonum for -stop, and -count is a synonym for |
801
|
|
|
|
|
|
|
-enumerate. |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
=cut |
804
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
sub types { |
806
|
25
|
|
|
25
|
1
|
1390
|
my $self = shift; |
807
|
25
|
|
|
|
|
150
|
my ($refseq,$start,$stop,$enumerate,$refclass,$types) = rearrange ([ |
808
|
|
|
|
|
|
|
[qw(REF REFSEQ)], |
809
|
|
|
|
|
|
|
qw(START), |
810
|
|
|
|
|
|
|
[qw(STOP END)], |
811
|
|
|
|
|
|
|
[qw(ENUMERATE COUNT)], |
812
|
|
|
|
|
|
|
[qw(CLASS SEQCLASS)], |
813
|
|
|
|
|
|
|
[qw(TYPE TYPES)], |
814
|
|
|
|
|
|
|
],@_); |
815
|
25
|
50
|
|
|
|
92
|
$types = $self->parse_types($types) if defined $types; |
816
|
25
|
|
|
|
|
100
|
$self->get_types($refseq,$refclass,$start,$stop,$enumerate,$types); |
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
|
819
|
|
|
|
|
|
|
=head2 classes |
820
|
|
|
|
|
|
|
|
821
|
|
|
|
|
|
|
Title : classes |
822
|
|
|
|
|
|
|
Usage : $db->classes |
823
|
|
|
|
|
|
|
Function: return list of landmark classes in database |
824
|
|
|
|
|
|
|
Returns : a list of classes |
825
|
|
|
|
|
|
|
Args : none |
826
|
|
|
|
|
|
|
Status : public |
827
|
|
|
|
|
|
|
|
828
|
|
|
|
|
|
|
This routine returns the list of reference classes known to the |
829
|
|
|
|
|
|
|
database, or empty if classes are not used by the database. Classes |
830
|
|
|
|
|
|
|
are distinct from types, being essentially qualifiers on the reference |
831
|
|
|
|
|
|
|
namespaces. |
832
|
|
|
|
|
|
|
|
833
|
|
|
|
|
|
|
=cut |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
sub classes { |
836
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
837
|
0
|
|
|
|
|
0
|
return (); |
838
|
|
|
|
|
|
|
} |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=head2 segment |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
Title : segment |
843
|
|
|
|
|
|
|
Usage : $db->segment(@args); |
844
|
|
|
|
|
|
|
Function: create a segment object |
845
|
|
|
|
|
|
|
Returns : segment object(s) |
846
|
|
|
|
|
|
|
Args : numerous, see below |
847
|
|
|
|
|
|
|
Status : public |
848
|
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
This method generates a segment object, which is a Perl object |
850
|
|
|
|
|
|
|
subclassed from Bio::DB::GFF::Segment. The segment can be used to |
851
|
|
|
|
|
|
|
find overlapping features and the raw DNA. |
852
|
|
|
|
|
|
|
|
853
|
|
|
|
|
|
|
When making the segment() call, you specify the ID of a sequence |
854
|
|
|
|
|
|
|
landmark (e.g. an accession number, a clone or contig), and a |
855
|
|
|
|
|
|
|
positional range relative to the landmark. If no range is specified, |
856
|
|
|
|
|
|
|
then the entire extent of the landmark is used to generate the |
857
|
|
|
|
|
|
|
segment. |
858
|
|
|
|
|
|
|
|
859
|
|
|
|
|
|
|
You may also provide the ID of a "reference" sequence, which will set |
860
|
|
|
|
|
|
|
the coordinate system and orientation used for all features contained |
861
|
|
|
|
|
|
|
within the segment. The reference sequence can be changed later. If |
862
|
|
|
|
|
|
|
no reference sequence is provided, then the coordinate system is based |
863
|
|
|
|
|
|
|
on the landmark. |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
Arguments: |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
-name ID of the landmark sequence. |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
-class Database object class for the landmark sequence. |
870
|
|
|
|
|
|
|
"Sequence" assumed if not specified. This is |
871
|
|
|
|
|
|
|
irrelevant for databases which do not recognize |
872
|
|
|
|
|
|
|
object classes. |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
-start Start of the segment relative to landmark. Positions |
875
|
|
|
|
|
|
|
follow standard 1-based sequence rules. If not specified, |
876
|
|
|
|
|
|
|
defaults to the beginning of the landmark. |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
-end Stop of the segment relative to the landmark. If not specified, |
879
|
|
|
|
|
|
|
defaults to the end of the landmark. |
880
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
-stop Same as -end. |
882
|
|
|
|
|
|
|
|
883
|
|
|
|
|
|
|
-offset For those who prefer 0-based indexing, the offset specifies the |
884
|
|
|
|
|
|
|
position of the new segment relative to the start of the landmark. |
885
|
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
-length For those who prefer 0-based indexing, the length specifies the |
887
|
|
|
|
|
|
|
length of the new segment. |
888
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
-refseq Specifies the ID of the reference landmark used to establish the |
890
|
|
|
|
|
|
|
coordinate system for the newly-created segment. |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
-refclass Specifies the class of the reference landmark, for those databases |
893
|
|
|
|
|
|
|
that distinguish different object classes. Defaults to "Sequence". |
894
|
|
|
|
|
|
|
|
895
|
|
|
|
|
|
|
-absolute |
896
|
|
|
|
|
|
|
Return features in absolute coordinates rather than relative to the |
897
|
|
|
|
|
|
|
parent segment. |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
-nocheck Don't check the database for the coordinates and length of this |
900
|
|
|
|
|
|
|
feature. Construct a segment using the indicated name as the |
901
|
|
|
|
|
|
|
reference, a start coordinate of 1, an undefined end coordinate, |
902
|
|
|
|
|
|
|
and a strand of +1. |
903
|
|
|
|
|
|
|
|
904
|
|
|
|
|
|
|
-force Same as -nocheck. |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
-seq,-sequence,-sourceseq Aliases for -name. |
907
|
|
|
|
|
|
|
|
908
|
|
|
|
|
|
|
-begin,-end Aliases for -start and -stop |
909
|
|
|
|
|
|
|
|
910
|
|
|
|
|
|
|
-off,-len Aliases for -offset and -length |
911
|
|
|
|
|
|
|
|
912
|
|
|
|
|
|
|
-seqclass Alias for -class |
913
|
|
|
|
|
|
|
|
914
|
|
|
|
|
|
|
Here's an example to explain how this works: |
915
|
|
|
|
|
|
|
|
916
|
|
|
|
|
|
|
my $db = Bio::DB::GFF->new(-dsn => 'dbi:mysql:human',-adaptor=>'dbi::mysql'); |
917
|
|
|
|
|
|
|
|
918
|
|
|
|
|
|
|
If successful, $db will now hold the database accessor object. We now |
919
|
|
|
|
|
|
|
try to fetch the fragment of sequence whose ID is A0000182 and class |
920
|
|
|
|
|
|
|
is "Accession." |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
my $segment = $db->segment(-name=>'A0000182',-class=>'Accession'); |
923
|
|
|
|
|
|
|
|
924
|
|
|
|
|
|
|
If successful, $segment now holds the entire segment corresponding to |
925
|
|
|
|
|
|
|
this accession number. By default, the sequence is used as its own |
926
|
|
|
|
|
|
|
reference sequence, so its first base will be 1 and its last base will |
927
|
|
|
|
|
|
|
be the length of the accession. |
928
|
|
|
|
|
|
|
|
929
|
|
|
|
|
|
|
Assuming that this sequence belongs to a longer stretch of DNA, say a |
930
|
|
|
|
|
|
|
contig, we can fetch this information like so: |
931
|
|
|
|
|
|
|
|
932
|
|
|
|
|
|
|
my $sourceseq = $segment->sourceseq; |
933
|
|
|
|
|
|
|
|
934
|
|
|
|
|
|
|
and find the start and stop on the source like this: |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
my $start = $segment->abs_start; |
937
|
|
|
|
|
|
|
my $stop = $segment->abs_stop; |
938
|
|
|
|
|
|
|
|
939
|
|
|
|
|
|
|
If we had another segment, say $s2, which is on the same contiguous |
940
|
|
|
|
|
|
|
piece of DNA, we can pass that to the refseq() method in order to |
941
|
|
|
|
|
|
|
establish it as the coordinate reference point: |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
$segment->refseq($s2); |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Now calling start() will return the start of the segment relative to |
946
|
|
|
|
|
|
|
the beginning of $s2, accounting for differences in strandedness: |
947
|
|
|
|
|
|
|
|
948
|
|
|
|
|
|
|
my $rel_start = $segment->start; |
949
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
IMPORTANT NOTE: This method can be used to return the segment spanned |
951
|
|
|
|
|
|
|
by an arbitrary named annotation. However, if the annotation appears |
952
|
|
|
|
|
|
|
at multiple locations on the genome, for example an EST that maps to |
953
|
|
|
|
|
|
|
multiple locations, then, provided that all locations reside on the |
954
|
|
|
|
|
|
|
same physical segment, the method will return a segment that spans the |
955
|
|
|
|
|
|
|
minimum and maximum positions. If the reference sequence occupies |
956
|
|
|
|
|
|
|
ranges on different physical segments, then it returns them all in an |
957
|
|
|
|
|
|
|
array context, and raises a "multiple segment exception" exception in |
958
|
|
|
|
|
|
|
a scalar context. |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
=cut |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
#' |
963
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
sub segment { |
965
|
203
|
|
|
203
|
1
|
13787
|
my $self = shift; |
966
|
203
|
|
|
|
|
599
|
my @segments = Bio::DB::GFF::RelSegment->new(-factory => $self, |
967
|
|
|
|
|
|
|
$self->setup_segment_args(@_)); |
968
|
203
|
|
|
|
|
323
|
foreach (@segments) { |
969
|
195
|
50
|
|
|
|
319
|
$_->absolute(1) if $self->absolute; |
970
|
|
|
|
|
|
|
} |
971
|
|
|
|
|
|
|
|
972
|
203
|
|
|
|
|
287
|
$self->_multiple_return_args(@segments); |
973
|
|
|
|
|
|
|
} |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
sub _multiple_return_args { |
976
|
203
|
|
|
203
|
|
179
|
my $self = shift; |
977
|
203
|
|
|
|
|
265
|
my @args = @_; |
978
|
203
|
100
|
|
|
|
413
|
if (@args == 0) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
979
|
8
|
|
|
|
|
28
|
return; |
980
|
|
|
|
|
|
|
} elsif (@args == 1) { |
981
|
195
|
|
|
|
|
457
|
return $args[0]; |
982
|
|
|
|
|
|
|
} elsif (wantarray) { # more than one reference sequence |
983
|
0
|
|
|
|
|
0
|
return @args; |
984
|
|
|
|
|
|
|
} else { |
985
|
0
|
|
|
|
|
0
|
$self->error($args[0]->name, |
986
|
|
|
|
|
|
|
" has more than one reference sequence in database. Please call in a list context to retrieve them all."); |
987
|
0
|
|
|
|
|
0
|
$self->throw('multiple segment exception'); |
988
|
0
|
|
|
|
|
0
|
return; |
989
|
|
|
|
|
|
|
} |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
} |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
# backward compatibility -- don't use! |
994
|
|
|
|
|
|
|
# (deliberately undocumented too) |
995
|
|
|
|
|
|
|
sub abs_segment { |
996
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
997
|
0
|
|
|
|
|
0
|
return $self->segment($self->setup_segment_args(@_),-absolute=>1); |
998
|
|
|
|
|
|
|
} |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
sub setup_segment_args { |
1001
|
224
|
|
|
224
|
0
|
190
|
my $self = shift; |
1002
|
224
|
100
|
100
|
|
|
1360
|
return @_ if defined $_[0] && $_[0] =~ /^-/; |
1003
|
156
|
100
|
|
|
|
552
|
return (-name=>$_[0],-start=>$_[1],-stop=>$_[2]) if @_ == 3; |
1004
|
71
|
100
|
|
|
|
239
|
return (-class=>$_[0],-name=>$_[1]) if @_ == 2; |
1005
|
36
|
100
|
|
|
|
261
|
return (-name=>$_[0]) if @_ == 1; |
1006
|
|
|
|
|
|
|
} |
1007
|
|
|
|
|
|
|
|
1008
|
|
|
|
|
|
|
=head2 features |
1009
|
|
|
|
|
|
|
|
1010
|
|
|
|
|
|
|
Title : features |
1011
|
|
|
|
|
|
|
Usage : $db->features(@args) |
1012
|
|
|
|
|
|
|
Function: get all features, possibly filtered by type |
1013
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
1014
|
|
|
|
|
|
|
Args : see below |
1015
|
|
|
|
|
|
|
Status : public |
1016
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
This routine will retrieve features in the database regardless of |
1018
|
|
|
|
|
|
|
position. It can be used to return all features, or a subset based on |
1019
|
|
|
|
|
|
|
their method and source. |
1020
|
|
|
|
|
|
|
|
1021
|
|
|
|
|
|
|
Arguments are as follows: |
1022
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
-types List of feature types to return. Argument is an array |
1024
|
|
|
|
|
|
|
reference containing strings of the format "method:source" |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
-merge Whether to apply aggregators to the generated features. |
1027
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
-rare Turn on optimizations suitable for a relatively rare feature type, |
1029
|
|
|
|
|
|
|
where it makes more sense to filter by feature type first, |
1030
|
|
|
|
|
|
|
and then by position. |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
-attributes A hash reference containing attributes to match. |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
-iterator Whether to return an iterator across the features. |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
-binsize A true value will create a set of artificial features whose |
1037
|
|
|
|
|
|
|
start and stop positions indicate bins of the given size, and |
1038
|
|
|
|
|
|
|
whose scores are the number of features in the bin. The |
1039
|
|
|
|
|
|
|
class and method of the feature will be set to "bin", |
1040
|
|
|
|
|
|
|
its source to "method:source", and its group to "bin:method:source". |
1041
|
|
|
|
|
|
|
This is a handy way of generating histograms of feature density. |
1042
|
|
|
|
|
|
|
|
1043
|
|
|
|
|
|
|
If -iterator is true, then the method returns a single scalar value |
1044
|
|
|
|
|
|
|
consisting of a Bio::SeqIO object. You can call next_seq() repeatedly |
1045
|
|
|
|
|
|
|
on this object to fetch each of the features in turn. If iterator is |
1046
|
|
|
|
|
|
|
false or absent, then all the features are returned as a list. |
1047
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
Currently aggregation is disabled when iterating over a series of |
1049
|
|
|
|
|
|
|
features. |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
Types are indicated using the nomenclature "method:source". Either of |
1052
|
|
|
|
|
|
|
these fields can be omitted, in which case a wildcard is used for the |
1053
|
|
|
|
|
|
|
missing field. Type names without the colon (e.g. "exon") are |
1054
|
|
|
|
|
|
|
interpreted as the method name and a source wild card. Regular |
1055
|
|
|
|
|
|
|
expressions are allowed in either field, as in: "similarity:BLAST.*". |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
The -attributes argument is a hashref containing one or more attributes |
1058
|
|
|
|
|
|
|
to match against: |
1059
|
|
|
|
|
|
|
|
1060
|
|
|
|
|
|
|
-attributes => { Gene => 'abc-1', |
1061
|
|
|
|
|
|
|
Note => 'confirmed' } |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
Attribute matching is simple string matching, and multiple attributes |
1064
|
|
|
|
|
|
|
are ANDed together. |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
=cut |
1067
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
sub features { |
1069
|
15
|
|
|
15
|
1
|
2487
|
my $self = shift; |
1070
|
15
|
|
|
|
|
50
|
my ($types,$automerge,$sparse,$iterator,$refseq,$start,$end,$other); |
1071
|
15
|
100
|
66
|
|
|
129
|
if (defined $_[0] && |
1072
|
|
|
|
|
|
|
$_[0] =~ /^-/) { |
1073
|
10
|
|
|
|
|
66
|
($types,$automerge,$sparse,$iterator, |
1074
|
|
|
|
|
|
|
$refseq,$start,$end, |
1075
|
|
|
|
|
|
|
$other) = rearrange([ |
1076
|
|
|
|
|
|
|
[qw(TYPE TYPES)], |
1077
|
|
|
|
|
|
|
[qw(MERGE AUTOMERGE)], |
1078
|
|
|
|
|
|
|
[qw(RARE SPARSE)], |
1079
|
|
|
|
|
|
|
'ITERATOR', |
1080
|
|
|
|
|
|
|
[qw(REFSEQ SEQ_ID)], |
1081
|
|
|
|
|
|
|
'START', |
1082
|
|
|
|
|
|
|
[qw(STOP END)], |
1083
|
|
|
|
|
|
|
],@_); |
1084
|
|
|
|
|
|
|
} else { |
1085
|
5
|
|
|
|
|
49
|
$types = \@_; |
1086
|
|
|
|
|
|
|
} |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
# for whole database retrievals, we probably don't want to automerge! |
1089
|
15
|
100
|
|
|
|
113
|
$automerge = $self->automerge unless defined $automerge; |
1090
|
15
|
|
50
|
|
|
118
|
$other ||= {}; |
1091
|
15
|
50
|
|
|
|
299
|
$self->_features({ |
1092
|
|
|
|
|
|
|
rangetype => $refseq ? 'overlaps' : 'contains', |
1093
|
|
|
|
|
|
|
types => $types, |
1094
|
|
|
|
|
|
|
refseq => $refseq, |
1095
|
|
|
|
|
|
|
start => $start, |
1096
|
|
|
|
|
|
|
stop => $end, |
1097
|
|
|
|
|
|
|
}, |
1098
|
|
|
|
|
|
|
{ sparse => $sparse, |
1099
|
|
|
|
|
|
|
automerge => $automerge, |
1100
|
|
|
|
|
|
|
iterator =>$iterator, |
1101
|
|
|
|
|
|
|
%$other, |
1102
|
|
|
|
|
|
|
} |
1103
|
|
|
|
|
|
|
); |
1104
|
|
|
|
|
|
|
} |
1105
|
|
|
|
|
|
|
|
1106
|
|
|
|
|
|
|
=head2 get_seq_stream |
1107
|
|
|
|
|
|
|
|
1108
|
|
|
|
|
|
|
Title : get_seq_stream |
1109
|
|
|
|
|
|
|
Usage : my $seqio = $self->get_seq_sream(@args) |
1110
|
|
|
|
|
|
|
Function: Performs a query and returns an iterator over it |
1111
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream capable of producing sequence |
1112
|
|
|
|
|
|
|
Args : As in features() |
1113
|
|
|
|
|
|
|
Status : public |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
This routine takes the same arguments as features(), but returns a |
1116
|
|
|
|
|
|
|
Bio::SeqIO::Stream-compliant object. Use it like this: |
1117
|
|
|
|
|
|
|
|
1118
|
|
|
|
|
|
|
$stream = $db->get_seq_stream('exon'); |
1119
|
|
|
|
|
|
|
while (my $exon = $stream->next_seq) { |
1120
|
|
|
|
|
|
|
print $exon,"\n"; |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
|
1123
|
|
|
|
|
|
|
NOTE: This is also called get_feature_stream(), since that's what it |
1124
|
|
|
|
|
|
|
really does. |
1125
|
|
|
|
|
|
|
|
1126
|
|
|
|
|
|
|
=cut |
1127
|
|
|
|
|
|
|
|
1128
|
|
|
|
|
|
|
sub get_seq_stream { |
1129
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1130
|
0
|
0
|
0
|
|
|
0
|
my @args = !defined($_[0]) || $_[0] =~ /^-/ ? (@_,-iterator=>1) |
1131
|
|
|
|
|
|
|
: (-types=>\@_,-iterator=>1); |
1132
|
0
|
|
|
|
|
0
|
$self->features(@args); |
1133
|
|
|
|
|
|
|
} |
1134
|
|
|
|
|
|
|
|
1135
|
|
|
|
|
|
|
*get_feature_stream = \&get_seq_stream; |
1136
|
|
|
|
|
|
|
|
1137
|
|
|
|
|
|
|
=head2 get_feature_by_name |
1138
|
|
|
|
|
|
|
|
1139
|
|
|
|
|
|
|
Title : get_feature_by_name |
1140
|
|
|
|
|
|
|
Usage : $db->get_feature_by_name($class => $name) |
1141
|
|
|
|
|
|
|
Function: fetch features by their name |
1142
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
1143
|
|
|
|
|
|
|
Args : the class and name of the desired feature |
1144
|
|
|
|
|
|
|
Status : public |
1145
|
|
|
|
|
|
|
|
1146
|
|
|
|
|
|
|
This method can be used to fetch a named feature from the database. |
1147
|
|
|
|
|
|
|
GFF annotations are named using the group class and name fields, so |
1148
|
|
|
|
|
|
|
for features that belong to a group of size one, this method can be |
1149
|
|
|
|
|
|
|
used to retrieve that group (and is equivalent to the segment() |
1150
|
|
|
|
|
|
|
method). Any Alias attributes are also searched for matching names. |
1151
|
|
|
|
|
|
|
|
1152
|
|
|
|
|
|
|
An alternative syntax allows you to search for features by name within |
1153
|
|
|
|
|
|
|
a circumscribed region: |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
@f = $db->get_feature_by_name(-class => $class,-name=>$name, |
1156
|
|
|
|
|
|
|
-ref => $sequence_name, |
1157
|
|
|
|
|
|
|
-start => $start, |
1158
|
|
|
|
|
|
|
-end => $end); |
1159
|
|
|
|
|
|
|
|
1160
|
|
|
|
|
|
|
This method may return zero, one, or several Bio::DB::GFF::Feature |
1161
|
|
|
|
|
|
|
objects. |
1162
|
|
|
|
|
|
|
|
1163
|
|
|
|
|
|
|
Aggregation is performed on features as usual. |
1164
|
|
|
|
|
|
|
|
1165
|
|
|
|
|
|
|
NOTE: At various times, this function was called fetch_group(), |
1166
|
|
|
|
|
|
|
fetch_feature(), fetch_feature_by_name() and segments(). These names |
1167
|
|
|
|
|
|
|
are preserved for backward compatibility. |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=cut |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
sub get_feature_by_name { |
1172
|
16
|
|
|
16
|
1
|
30
|
my $self = shift; |
1173
|
16
|
|
|
|
|
26
|
my ($gclass,$gname,$automerge,$ref,$start,$end); |
1174
|
16
|
50
|
|
|
|
74
|
if (@_ == 1) { |
1175
|
0
|
|
|
|
|
0
|
$gclass = $self->default_class; |
1176
|
0
|
|
|
|
|
0
|
$gname = shift; |
1177
|
|
|
|
|
|
|
} else { |
1178
|
16
|
|
|
|
|
103
|
($gclass,$gname,$automerge,$ref,$start,$end) = rearrange(['CLASS','NAME','AUTOMERGE', |
1179
|
|
|
|
|
|
|
['REF','REFSEQ'], |
1180
|
|
|
|
|
|
|
'START',['STOP','END'] |
1181
|
|
|
|
|
|
|
],@_); |
1182
|
16
|
|
33
|
|
|
56
|
$gclass ||= $self->default_class; |
1183
|
|
|
|
|
|
|
} |
1184
|
16
|
50
|
|
|
|
53
|
$automerge = $self->automerge unless defined $automerge; |
1185
|
|
|
|
|
|
|
|
1186
|
|
|
|
|
|
|
# we need to refactor this... It's repeated code (see below)... |
1187
|
16
|
|
|
|
|
19
|
my @aggregators; |
1188
|
16
|
50
|
|
|
|
37
|
if ($automerge) { |
1189
|
16
|
|
|
|
|
40
|
for my $a ($self->aggregators) { |
1190
|
33
|
50
|
|
|
|
113
|
push @aggregators,$a if $a->disaggregate([],$self); |
1191
|
|
|
|
|
|
|
} |
1192
|
|
|
|
|
|
|
} |
1193
|
|
|
|
|
|
|
|
1194
|
16
|
|
|
|
|
16
|
my %groups; # cache the groups we create to avoid consuming too much unecessary memory |
1195
|
16
|
|
|
|
|
46
|
my $features = []; |
1196
|
16
|
|
|
74
|
|
81
|
my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; |
|
74
|
|
|
|
|
152
|
|
1197
|
16
|
50
|
|
|
|
32
|
my $location = [$ref,$start,$end] if defined $ref; |
1198
|
16
|
|
|
|
|
78
|
$self->_feature_by_name($gclass,$gname,$location,$callback); |
1199
|
|
|
|
|
|
|
|
1200
|
16
|
50
|
|
|
|
40
|
warn "aggregating...\n" if $self->debug; |
1201
|
16
|
|
|
|
|
32
|
foreach my $a (@aggregators) { # last aggregator gets first shot |
1202
|
33
|
50
|
|
|
|
89
|
$a->aggregate($features,$self) or next; |
1203
|
|
|
|
|
|
|
} |
1204
|
|
|
|
|
|
|
|
1205
|
16
|
|
|
|
|
98
|
@$features; |
1206
|
|
|
|
|
|
|
} |
1207
|
|
|
|
|
|
|
|
1208
|
|
|
|
|
|
|
# horrible indecision regarding proper names! |
1209
|
|
|
|
|
|
|
*fetch_group = *fetch_feature = *fetch_feature_by_name = \&get_feature_by_name; |
1210
|
|
|
|
|
|
|
*segments = \&segment; |
1211
|
|
|
|
|
|
|
|
1212
|
|
|
|
|
|
|
=head2 get_feature_by_target |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
Title : get_feature_by_target |
1215
|
|
|
|
|
|
|
Usage : $db->get_feature_by_target($class => $name) |
1216
|
|
|
|
|
|
|
Function: fetch features by their similarity target |
1217
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
1218
|
|
|
|
|
|
|
Args : the class and name of the desired feature |
1219
|
|
|
|
|
|
|
Status : public |
1220
|
|
|
|
|
|
|
|
1221
|
|
|
|
|
|
|
This method can be used to fetch a named feature from the database |
1222
|
|
|
|
|
|
|
based on its similarity hit. |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
=cut |
1225
|
|
|
|
|
|
|
|
1226
|
|
|
|
|
|
|
sub get_feature_by_target { |
1227
|
0
|
|
|
0
|
1
|
0
|
shift->get_feature_by_name(@_); |
1228
|
|
|
|
|
|
|
} |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
=head2 get_feature_by_attribute |
1231
|
|
|
|
|
|
|
|
1232
|
|
|
|
|
|
|
Title : get_feature_by_attribute |
1233
|
|
|
|
|
|
|
Usage : $db->get_feature_by_attribute(attribute1=>value1,attribute2=>value2) |
1234
|
|
|
|
|
|
|
Function: fetch segments by combinations of attribute values |
1235
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
1236
|
|
|
|
|
|
|
Args : the class and name of the desired feature |
1237
|
|
|
|
|
|
|
Status : public |
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
This method can be used to fetch a set of features from the database. |
1240
|
|
|
|
|
|
|
Attributes are a list of name=Evalue pairs. They will be logically |
1241
|
|
|
|
|
|
|
ANDED together. |
1242
|
|
|
|
|
|
|
|
1243
|
|
|
|
|
|
|
=cut |
1244
|
|
|
|
|
|
|
|
1245
|
|
|
|
|
|
|
sub get_feature_by_attribute { |
1246
|
5
|
|
|
5
|
1
|
2101
|
my $self = shift; |
1247
|
5
|
50
|
|
|
|
20
|
my %attributes = ref($_[0]) ? %{$_[0]} : @_; |
|
0
|
|
|
|
|
0
|
|
1248
|
|
|
|
|
|
|
|
1249
|
|
|
|
|
|
|
# we need to refactor this... It's repeated code (see above)... |
1250
|
5
|
|
|
|
|
8
|
my @aggregators; |
1251
|
5
|
50
|
|
|
|
10
|
if ($self->automerge) { |
1252
|
5
|
|
|
|
|
15
|
for my $a ($self->aggregators) { |
1253
|
10
|
50
|
|
|
|
26
|
unshift @aggregators,$a if $a->disaggregate([],$self); |
1254
|
|
|
|
|
|
|
} |
1255
|
|
|
|
|
|
|
} |
1256
|
|
|
|
|
|
|
|
1257
|
5
|
|
|
|
|
8
|
my %groups; # cache the groups we create to avoid consuming too much unecessary memory |
1258
|
5
|
|
|
|
|
5
|
my $features = []; |
1259
|
5
|
|
|
10
|
|
22
|
my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; |
|
10
|
|
|
|
|
22
|
|
1260
|
5
|
|
|
|
|
21
|
$self->_feature_by_attribute(\%attributes,$callback); |
1261
|
|
|
|
|
|
|
|
1262
|
5
|
50
|
|
|
|
15
|
warn "aggregating...\n" if $self->debug; |
1263
|
5
|
|
|
|
|
7
|
foreach my $a (@aggregators) { # last aggregator gets first shot |
1264
|
10
|
50
|
|
|
|
23
|
$a->aggregate($features,$self) or next; |
1265
|
|
|
|
|
|
|
} |
1266
|
|
|
|
|
|
|
|
1267
|
5
|
|
|
|
|
67
|
@$features; |
1268
|
|
|
|
|
|
|
} |
1269
|
|
|
|
|
|
|
|
1270
|
|
|
|
|
|
|
# more indecision... |
1271
|
|
|
|
|
|
|
*fetch_feature_by_attribute = \&get_feature_by_attribute; |
1272
|
|
|
|
|
|
|
|
1273
|
|
|
|
|
|
|
=head2 get_feature_by_id |
1274
|
|
|
|
|
|
|
|
1275
|
|
|
|
|
|
|
Title : get_feature_by_id |
1276
|
|
|
|
|
|
|
Usage : $db->get_feature_by_id($id) |
1277
|
|
|
|
|
|
|
Function: fetch segments by feature ID |
1278
|
|
|
|
|
|
|
Returns : a Bio::DB::GFF::Feature object |
1279
|
|
|
|
|
|
|
Args : the feature ID |
1280
|
|
|
|
|
|
|
Status : public |
1281
|
|
|
|
|
|
|
|
1282
|
|
|
|
|
|
|
This method can be used to fetch a feature from the database using its |
1283
|
|
|
|
|
|
|
ID. Not all GFF databases support IDs, so be careful with this. |
1284
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
=cut |
1286
|
|
|
|
|
|
|
|
1287
|
|
|
|
|
|
|
sub get_feature_by_id { |
1288
|
5
|
|
|
5
|
1
|
7
|
my $self = shift; |
1289
|
5
|
50
|
|
|
|
25
|
my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; |
1290
|
5
|
|
|
|
|
10
|
my %groups; # cache the groups we create to avoid consuming too much unecessary memory |
1291
|
5
|
|
|
|
|
8
|
my $features = []; |
1292
|
5
|
|
|
5
|
|
30
|
my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; |
|
5
|
|
|
|
|
20
|
|
1293
|
5
|
|
|
|
|
27
|
$self->_feature_by_id($id,'feature',$callback); |
1294
|
5
|
50
|
|
|
|
38
|
return wantarray ? @$features : $features->[0]; |
1295
|
|
|
|
|
|
|
} |
1296
|
|
|
|
|
|
|
*fetch_feature_by_id = \&get_feature_by_id; |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
=head2 get_feature_by_gid |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
Title : get_feature_by_gid |
1301
|
|
|
|
|
|
|
Usage : $db->get_feature_by_gid($id) |
1302
|
|
|
|
|
|
|
Function: fetch segments by feature ID |
1303
|
|
|
|
|
|
|
Returns : a Bio::DB::GFF::Feature object |
1304
|
|
|
|
|
|
|
Args : the feature ID |
1305
|
|
|
|
|
|
|
Status : public |
1306
|
|
|
|
|
|
|
|
1307
|
|
|
|
|
|
|
This method can be used to fetch a feature from the database using its |
1308
|
|
|
|
|
|
|
group ID. Not all GFF databases support IDs, so be careful with this. |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
The group ID is often more interesting than the feature ID, since |
1311
|
|
|
|
|
|
|
groups can be complex objects containing subobjects. |
1312
|
|
|
|
|
|
|
|
1313
|
|
|
|
|
|
|
=cut |
1314
|
|
|
|
|
|
|
|
1315
|
|
|
|
|
|
|
sub get_feature_by_gid { |
1316
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1317
|
0
|
0
|
|
|
|
0
|
my $id = ref($_[0]) eq 'ARRAY' ? $_[0] : \@_; |
1318
|
0
|
|
|
|
|
0
|
my %groups; # cache the groups we create to avoid consuming too much unecessary memory |
1319
|
0
|
|
|
|
|
0
|
my $features = []; |
1320
|
0
|
|
|
0
|
|
0
|
my $callback = sub { push @$features,$self->make_feature(undef,\%groups,@_) }; |
|
0
|
|
|
|
|
0
|
|
1321
|
0
|
|
|
|
|
0
|
$self->_feature_by_id($id,'group',$callback); |
1322
|
0
|
0
|
|
|
|
0
|
return wantarray ? @$features : $features->[0]; |
1323
|
|
|
|
|
|
|
} |
1324
|
|
|
|
|
|
|
*fetch_feature_by_gid = \&get_feature_by_gid; |
1325
|
|
|
|
|
|
|
|
1326
|
|
|
|
|
|
|
=head2 delete_fattribute_to_features |
1327
|
|
|
|
|
|
|
|
1328
|
|
|
|
|
|
|
Title : delete_fattribute_to_features |
1329
|
|
|
|
|
|
|
Usage : $db->delete_fattribute_to_features(@ids_or_features) |
1330
|
|
|
|
|
|
|
Function: delete one or more fattribute_to_features |
1331
|
|
|
|
|
|
|
Returns : count of fattribute_to_features deleted |
1332
|
|
|
|
|
|
|
Args : list of features or feature ids |
1333
|
|
|
|
|
|
|
Status : public |
1334
|
|
|
|
|
|
|
|
1335
|
|
|
|
|
|
|
Pass this method a list of numeric feature ids or a set of features. |
1336
|
|
|
|
|
|
|
It will attempt to remove the fattribute_to_features rows of those features |
1337
|
|
|
|
|
|
|
from the database and return a count of the rows removed. |
1338
|
|
|
|
|
|
|
|
1339
|
|
|
|
|
|
|
NOTE: This method is also called delete_fattribute_to_feature(). Also see |
1340
|
|
|
|
|
|
|
delete_groups() and delete_features(). |
1341
|
|
|
|
|
|
|
|
1342
|
|
|
|
|
|
|
=cut |
1343
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
*delete_fattribute_to_feature = \&delete_fattribute_to_features; |
1345
|
|
|
|
|
|
|
|
1346
|
|
|
|
|
|
|
sub delete_fattribute_to_features { |
1347
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1348
|
0
|
|
|
|
|
0
|
my @features_or_ids = @_; |
1349
|
0
|
0
|
|
|
|
0
|
my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; |
|
0
|
|
|
|
|
0
|
|
1350
|
0
|
0
|
|
|
|
0
|
return unless @ids; |
1351
|
0
|
|
|
|
|
0
|
$self->_delete_fattribute_to_features(@ids); |
1352
|
|
|
|
|
|
|
} |
1353
|
|
|
|
|
|
|
|
1354
|
|
|
|
|
|
|
=head2 delete_features |
1355
|
|
|
|
|
|
|
|
1356
|
|
|
|
|
|
|
Title : delete_features |
1357
|
|
|
|
|
|
|
Usage : $db->delete_features(@ids_or_features) |
1358
|
|
|
|
|
|
|
Function: delete one or more features |
1359
|
|
|
|
|
|
|
Returns : count of features deleted |
1360
|
|
|
|
|
|
|
Args : list of features or feature ids |
1361
|
|
|
|
|
|
|
Status : public |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
Pass this method a list of numeric feature ids or a set of features. |
1364
|
|
|
|
|
|
|
It will attempt to remove the features from the database and return a |
1365
|
|
|
|
|
|
|
count of the features removed. |
1366
|
|
|
|
|
|
|
|
1367
|
|
|
|
|
|
|
NOTE: This method is also called delete_feature(). Also see |
1368
|
|
|
|
|
|
|
delete_groups(). |
1369
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
=cut |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
*delete_feature = \&delete_features; |
1373
|
|
|
|
|
|
|
|
1374
|
|
|
|
|
|
|
sub delete_features { |
1375
|
20
|
|
|
20
|
1
|
23
|
my $self = shift; |
1376
|
20
|
|
|
|
|
33
|
my @features_or_ids = @_; |
1377
|
20
|
50
|
|
|
|
22
|
my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->id : $_} @features_or_ids; |
|
75
|
|
|
|
|
232
|
|
1378
|
20
|
50
|
|
|
|
38
|
return unless @ids; |
1379
|
20
|
|
|
|
|
60
|
$self->_delete_features(@ids); |
1380
|
|
|
|
|
|
|
} |
1381
|
|
|
|
|
|
|
|
1382
|
|
|
|
|
|
|
=head2 delete_groups |
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
Title : delete_groups |
1385
|
|
|
|
|
|
|
Usage : $db->delete_groups(@ids_or_features) |
1386
|
|
|
|
|
|
|
Function: delete one or more feature groups |
1387
|
|
|
|
|
|
|
Returns : count of features deleted |
1388
|
|
|
|
|
|
|
Args : list of features or feature group ids |
1389
|
|
|
|
|
|
|
Status : public |
1390
|
|
|
|
|
|
|
|
1391
|
|
|
|
|
|
|
Pass this method a list of numeric group ids or a set of features. It |
1392
|
|
|
|
|
|
|
will attempt to recursively remove the features and ALL members of |
1393
|
|
|
|
|
|
|
their group from the database. It returns a count of the number of |
1394
|
|
|
|
|
|
|
features (not groups) returned. |
1395
|
|
|
|
|
|
|
|
1396
|
|
|
|
|
|
|
NOTE: This method is also called delete_group(). Also see |
1397
|
|
|
|
|
|
|
delete_features(). |
1398
|
|
|
|
|
|
|
|
1399
|
|
|
|
|
|
|
=cut |
1400
|
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
*delete_group = \&delete_groupss; |
1402
|
|
|
|
|
|
|
|
1403
|
|
|
|
|
|
|
sub delete_groups { |
1404
|
5
|
|
|
5
|
1
|
10
|
my $self = shift; |
1405
|
5
|
|
|
|
|
10
|
my @features_or_ids = @_; |
1406
|
5
|
50
|
|
|
|
10
|
my @ids = map {UNIVERSAL::isa($_,'Bio::DB::GFF::Feature') ? $_->group_id : $_} @features_or_ids; |
|
25
|
|
|
|
|
45
|
|
1407
|
5
|
50
|
|
|
|
13
|
return unless @ids; |
1408
|
5
|
|
|
|
|
29
|
$self->_delete_groups(@ids); |
1409
|
|
|
|
|
|
|
} |
1410
|
|
|
|
|
|
|
|
1411
|
|
|
|
|
|
|
=head2 delete |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
Title : delete |
1414
|
|
|
|
|
|
|
Usage : $db->delete(@args) |
1415
|
|
|
|
|
|
|
Function: delete features |
1416
|
|
|
|
|
|
|
Returns : count of features deleted -- if available |
1417
|
|
|
|
|
|
|
Args : numerous, see below |
1418
|
|
|
|
|
|
|
Status : public |
1419
|
|
|
|
|
|
|
|
1420
|
|
|
|
|
|
|
This method deletes all features that overlap the specified region or |
1421
|
|
|
|
|
|
|
are of a particular type. If no arguments are provided and the -force |
1422
|
|
|
|
|
|
|
argument is true, then deletes ALL features. |
1423
|
|
|
|
|
|
|
|
1424
|
|
|
|
|
|
|
Arguments: |
1425
|
|
|
|
|
|
|
|
1426
|
|
|
|
|
|
|
-name ID of the landmark sequence. |
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
-ref ID of the landmark sequence (synonym for -name). |
1429
|
|
|
|
|
|
|
|
1430
|
|
|
|
|
|
|
-class Database object class for the landmark sequence. |
1431
|
|
|
|
|
|
|
"Sequence" assumed if not specified. This is |
1432
|
|
|
|
|
|
|
irrelevant for databases which do not recognize |
1433
|
|
|
|
|
|
|
object classes. |
1434
|
|
|
|
|
|
|
|
1435
|
|
|
|
|
|
|
-start Start of the segment relative to landmark. Positions |
1436
|
|
|
|
|
|
|
follow standard 1-based sequence rules. If not specified, |
1437
|
|
|
|
|
|
|
defaults to the beginning of the landmark. |
1438
|
|
|
|
|
|
|
|
1439
|
|
|
|
|
|
|
-end Stop of the segment relative to the landmark. If not specified, |
1440
|
|
|
|
|
|
|
defaults to the end of the landmark. |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
-offset Zero-based addressing |
1443
|
|
|
|
|
|
|
|
1444
|
|
|
|
|
|
|
-length Length of region |
1445
|
|
|
|
|
|
|
|
1446
|
|
|
|
|
|
|
-type,-types Either a single scalar type to be deleted, or an |
1447
|
|
|
|
|
|
|
reference to an array of types. |
1448
|
|
|
|
|
|
|
|
1449
|
|
|
|
|
|
|
-force Force operation to be performed even if it would delete |
1450
|
|
|
|
|
|
|
entire feature table. |
1451
|
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
-range_type Control the range type of the deletion. One of "overlaps" (default) |
1453
|
|
|
|
|
|
|
"contains" or "contained_in" |
1454
|
|
|
|
|
|
|
|
1455
|
|
|
|
|
|
|
Examples: |
1456
|
|
|
|
|
|
|
|
1457
|
|
|
|
|
|
|
$db->delete(-type=>['intron','repeat:repeatMasker']); # remove all introns & repeats |
1458
|
|
|
|
|
|
|
$db->delete(-name=>'chr3',-start=>1,-end=>1000); # remove annotations on chr3 from 1 to 1000 |
1459
|
|
|
|
|
|
|
$db->delete(-name=>'chr3',-type=>'exon'); # remove all exons on chr3 |
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
The short form of this call, as described in segment() is also allowed: |
1462
|
|
|
|
|
|
|
|
1463
|
|
|
|
|
|
|
$db->delete("chr3",1=>1000); |
1464
|
|
|
|
|
|
|
$db->delete("chr3"); |
1465
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
IMPORTANT NOTE: This method only deletes features. It does *NOT* |
1467
|
|
|
|
|
|
|
delete the names of groups that contain the deleted features. Group |
1468
|
|
|
|
|
|
|
IDs will be reused if you later load a feature with the same group |
1469
|
|
|
|
|
|
|
name as one that was previously deleted. |
1470
|
|
|
|
|
|
|
|
1471
|
|
|
|
|
|
|
NOTE ON FEATURE COUNTS: The DBI-based versions of this call return the |
1472
|
|
|
|
|
|
|
result code from the SQL DELETE operation. Some dbd drivers return the |
1473
|
|
|
|
|
|
|
count of rows deleted, while others return 0E0. Caveat emptor. |
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
=cut |
1476
|
|
|
|
|
|
|
|
1477
|
|
|
|
|
|
|
sub delete { |
1478
|
21
|
|
|
21
|
1
|
16995870
|
my $self = shift; |
1479
|
21
|
|
|
|
|
66
|
my @args = $self->setup_segment_args(@_); |
1480
|
21
|
|
|
|
|
153
|
my ($name,$class,$start,$end,$offset,$length,$type,$force,$range_type) = |
1481
|
|
|
|
|
|
|
rearrange([['NAME','REF'],'CLASS','START',[qw(END STOP)],'OFFSET', |
1482
|
|
|
|
|
|
|
'LENGTH',[qw(TYPE TYPES)],'FORCE','RANGE_TYPE'],@args); |
1483
|
21
|
50
|
|
|
|
74
|
$offset = 0 unless defined $offset; |
1484
|
21
|
50
|
|
|
|
80
|
$start = $offset+1 unless defined $start; |
1485
|
21
|
50
|
33
|
|
|
88
|
$end = $start+$length-1 if !defined $end and $length; |
1486
|
21
|
|
66
|
|
|
78
|
$class ||= $self->default_class; |
1487
|
|
|
|
|
|
|
|
1488
|
21
|
|
|
|
|
61
|
my $types = $self->parse_types($type); # parse out list of types |
1489
|
|
|
|
|
|
|
|
1490
|
21
|
|
50
|
|
|
72
|
$range_type ||= 'overlaps'; |
1491
|
|
|
|
|
|
|
$self->throw("range type must be one of {". |
1492
|
|
|
|
|
|
|
join(',',keys %valid_range_types). |
1493
|
|
|
|
|
|
|
"}\n") |
1494
|
21
|
50
|
|
|
|
55
|
unless $valid_range_types{lc $range_type}; |
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
|
1497
|
21
|
|
|
|
|
14
|
my @segments; |
1498
|
21
|
100
|
100
|
|
|
90
|
if (defined $name && $name ne '') { |
1499
|
10
|
|
|
|
|
28
|
my @args = (-name=>$name,-class=>$class); |
1500
|
10
|
50
|
|
|
|
26
|
push @args,(-start=>$start) if defined $start; |
1501
|
10
|
50
|
|
|
|
21
|
push @args,(-end =>$end) if defined $end; |
1502
|
10
|
|
|
|
|
23
|
@segments = $self->segment(@args); |
1503
|
10
|
100
|
|
|
|
40
|
return unless @segments; |
1504
|
|
|
|
|
|
|
} |
1505
|
16
|
|
|
|
|
115
|
$self->_delete({segments => \@segments, |
1506
|
|
|
|
|
|
|
types => $types, |
1507
|
|
|
|
|
|
|
range_type => $range_type, |
1508
|
|
|
|
|
|
|
force => $force} |
1509
|
|
|
|
|
|
|
); |
1510
|
|
|
|
|
|
|
} |
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
=head2 absolute |
1513
|
|
|
|
|
|
|
|
1514
|
|
|
|
|
|
|
Title : absolute |
1515
|
|
|
|
|
|
|
Usage : $abs = $db->absolute([$abs]); |
1516
|
|
|
|
|
|
|
Function: gets/sets absolute mode |
1517
|
|
|
|
|
|
|
Returns : current setting of absolute mode boolean |
1518
|
|
|
|
|
|
|
Args : new setting for absolute mode boolean |
1519
|
|
|
|
|
|
|
Status : public |
1520
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
$db-Eabsolute(1) will turn on absolute mode for the entire database. |
1522
|
|
|
|
|
|
|
All segments retrieved will use absolute coordinates by default, |
1523
|
|
|
|
|
|
|
rather than relative coordinates. You can still set them to use |
1524
|
|
|
|
|
|
|
relative coordinates by calling $segment-Eabsolute(0). |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
Note that this is not the same as calling abs_segment(); it continues |
1527
|
|
|
|
|
|
|
to allow you to look up groups that are not used directly as reference |
1528
|
|
|
|
|
|
|
sequences. |
1529
|
|
|
|
|
|
|
|
1530
|
|
|
|
|
|
|
=cut |
1531
|
|
|
|
|
|
|
|
1532
|
|
|
|
|
|
|
sub absolute { |
1533
|
195
|
|
|
195
|
1
|
183
|
my $self = shift; |
1534
|
195
|
|
|
|
|
173
|
my $d = $self->{absolute}; |
1535
|
195
|
50
|
|
|
|
260
|
$self->{absolute} = shift if @_; |
1536
|
195
|
|
|
|
|
399
|
$d; |
1537
|
|
|
|
|
|
|
} |
1538
|
|
|
|
|
|
|
|
1539
|
|
|
|
|
|
|
=head2 strict_bounds_checking |
1540
|
|
|
|
|
|
|
|
1541
|
|
|
|
|
|
|
Title : strict_bounds_checking |
1542
|
|
|
|
|
|
|
Usage : $flag = $db->strict_bounds_checking([$flag]) |
1543
|
|
|
|
|
|
|
Function: gets/sets strict bounds checking |
1544
|
|
|
|
|
|
|
Returns : current setting of bounds checking flag |
1545
|
|
|
|
|
|
|
Args : new setting for bounds checking flag |
1546
|
|
|
|
|
|
|
Status : public |
1547
|
|
|
|
|
|
|
|
1548
|
|
|
|
|
|
|
This flag enables extra checks for segment requests that go beyond the |
1549
|
|
|
|
|
|
|
ends of their reference sequences. If bounds checking is enabled, |
1550
|
|
|
|
|
|
|
then retrieved segments will be truncated to their physical length, |
1551
|
|
|
|
|
|
|
and their truncated() methods will return true. |
1552
|
|
|
|
|
|
|
|
1553
|
|
|
|
|
|
|
If the flag is off (the default), then the module will return segments |
1554
|
|
|
|
|
|
|
that appear to extend beyond their physical boundaries. Requests for |
1555
|
|
|
|
|
|
|
features beyond the end of the segment will, however, return empty. |
1556
|
|
|
|
|
|
|
|
1557
|
|
|
|
|
|
|
=cut |
1558
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
sub strict_bounds_checking { |
1560
|
170
|
|
|
170
|
1
|
158
|
my $self = shift; |
1561
|
170
|
|
|
|
|
202
|
my $d = $self->{strict}; |
1562
|
170
|
100
|
|
|
|
341
|
$self->{strict} = shift if @_; |
1563
|
170
|
|
|
|
|
376
|
$d; |
1564
|
|
|
|
|
|
|
} |
1565
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
=head2 get_Seq_by_id |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
Title : get_Seq_by_id |
1569
|
|
|
|
|
|
|
Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN') |
1570
|
|
|
|
|
|
|
Function: Gets a Bio::Seq object by its name |
1571
|
|
|
|
|
|
|
Returns : a Bio::Seq object |
1572
|
|
|
|
|
|
|
Args : the id (as a string) of a sequence |
1573
|
|
|
|
|
|
|
Throws : "id does not exist" exception |
1574
|
|
|
|
|
|
|
|
1575
|
|
|
|
|
|
|
NOTE: Bio::DB::RandomAccessI compliant method |
1576
|
|
|
|
|
|
|
|
1577
|
|
|
|
|
|
|
=cut |
1578
|
|
|
|
|
|
|
|
1579
|
|
|
|
|
|
|
sub get_Seq_by_id { |
1580
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1581
|
0
|
|
|
|
|
0
|
$self->get_feature_by_name(@_); |
1582
|
|
|
|
|
|
|
} |
1583
|
|
|
|
|
|
|
|
1584
|
|
|
|
|
|
|
|
1585
|
|
|
|
|
|
|
=head2 get_Seq_by_accession |
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
Title : get_Seq_by_accession |
1588
|
|
|
|
|
|
|
Usage : $seq = $db->get_Seq_by_accession('AL12234') |
1589
|
|
|
|
|
|
|
Function: Gets a Bio::Seq object by its accession |
1590
|
|
|
|
|
|
|
Returns : a Bio::Seq object |
1591
|
|
|
|
|
|
|
Args : the id (as a string) of a sequence |
1592
|
|
|
|
|
|
|
Throws : "id does not exist" exception |
1593
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
NOTE: Bio::DB::RandomAccessI compliant method |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
=cut |
1597
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
sub get_Seq_by_accession { |
1599
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1600
|
0
|
|
|
|
|
0
|
$self->get_feature_by_name(@_); |
1601
|
|
|
|
|
|
|
} |
1602
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
=head2 get_Seq_by_acc |
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
Title : get_Seq_by_acc |
1606
|
|
|
|
|
|
|
Usage : $seq = $db->get_Seq_by_acc('X77802'); |
1607
|
|
|
|
|
|
|
Function: Gets a Bio::Seq object by accession number |
1608
|
|
|
|
|
|
|
Returns : A Bio::Seq object |
1609
|
|
|
|
|
|
|
Args : accession number (as a string) |
1610
|
|
|
|
|
|
|
Throws : "acc does not exist" exception |
1611
|
|
|
|
|
|
|
|
1612
|
|
|
|
|
|
|
NOTE: Bio::DB::RandomAccessI compliant method |
1613
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
=cut |
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
sub get_Seq_by_acc { |
1617
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1618
|
0
|
|
|
|
|
0
|
$self->get_feature_by_name(@_); |
1619
|
|
|
|
|
|
|
} |
1620
|
|
|
|
|
|
|
|
1621
|
|
|
|
|
|
|
=head2 get_Stream_by_name |
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
Title : get_Stream_by_name |
1624
|
|
|
|
|
|
|
Usage : $seq = $db->get_Stream_by_name(@ids); |
1625
|
|
|
|
|
|
|
Function: Retrieves a stream of Seq objects given their names |
1626
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream object |
1627
|
|
|
|
|
|
|
Args : an array of unique ids/accession numbers, or |
1628
|
|
|
|
|
|
|
an array reference |
1629
|
|
|
|
|
|
|
|
1630
|
|
|
|
|
|
|
NOTE: This is also called get_Stream_by_batch() |
1631
|
|
|
|
|
|
|
|
1632
|
|
|
|
|
|
|
=cut |
1633
|
|
|
|
|
|
|
|
1634
|
|
|
|
|
|
|
sub get_Stream_by_name { |
1635
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1636
|
0
|
|
|
|
|
0
|
my @ids = @_; |
1637
|
0
|
0
|
|
|
|
0
|
my $id = ref($ids[0]) ? $ids[0] : \@ids; |
1638
|
0
|
|
|
|
|
0
|
Bio::DB::GFF::ID_Iterator->new($self,$id,'name'); |
1639
|
|
|
|
|
|
|
} |
1640
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
=head2 get_Stream_by_id |
1642
|
|
|
|
|
|
|
|
1643
|
|
|
|
|
|
|
Title : get_Stream_by_id |
1644
|
|
|
|
|
|
|
Usage : $seq = $db->get_Stream_by_id(@ids); |
1645
|
|
|
|
|
|
|
Function: Retrieves a stream of Seq objects given their ids |
1646
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream object |
1647
|
|
|
|
|
|
|
Args : an array of unique ids/accession numbers, or |
1648
|
|
|
|
|
|
|
an array reference |
1649
|
|
|
|
|
|
|
|
1650
|
|
|
|
|
|
|
NOTE: This is also called get_Stream_by_batch() |
1651
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
=cut |
1653
|
|
|
|
|
|
|
|
1654
|
|
|
|
|
|
|
sub get_Stream_by_id { |
1655
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1656
|
0
|
|
|
|
|
0
|
my @ids = @_; |
1657
|
0
|
0
|
|
|
|
0
|
my $id = ref($ids[0]) ? $ids[0] : \@ids; |
1658
|
0
|
|
|
|
|
0
|
Bio::DB::GFF::ID_Iterator->new($self,$id,'feature'); |
1659
|
|
|
|
|
|
|
} |
1660
|
|
|
|
|
|
|
|
1661
|
|
|
|
|
|
|
=head2 get_Stream_by_batch () |
1662
|
|
|
|
|
|
|
|
1663
|
|
|
|
|
|
|
Title : get_Stream_by_batch |
1664
|
|
|
|
|
|
|
Usage : $seq = $db->get_Stream_by_batch(@ids); |
1665
|
|
|
|
|
|
|
Function: Retrieves a stream of Seq objects given their ids |
1666
|
|
|
|
|
|
|
Returns : a Bio::SeqIO stream object |
1667
|
|
|
|
|
|
|
Args : an array of unique ids/accession numbers, or |
1668
|
|
|
|
|
|
|
an array reference |
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
NOTE: This is the same as get_Stream_by_id(). |
1671
|
|
|
|
|
|
|
|
1672
|
|
|
|
|
|
|
=cut |
1673
|
|
|
|
|
|
|
|
1674
|
|
|
|
|
|
|
*get_Stream_by_batch = \&get_Stream_by_id; |
1675
|
|
|
|
|
|
|
|
1676
|
|
|
|
|
|
|
|
1677
|
|
|
|
|
|
|
=head2 get_Stream_by_group () |
1678
|
|
|
|
|
|
|
|
1679
|
|
|
|
|
|
|
Bioperl compatibility. |
1680
|
|
|
|
|
|
|
|
1681
|
|
|
|
|
|
|
=cut |
1682
|
|
|
|
|
|
|
|
1683
|
|
|
|
|
|
|
sub get_Stream_by_group { |
1684
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1685
|
0
|
|
|
|
|
0
|
my @ids = @_; |
1686
|
0
|
0
|
|
|
|
0
|
my $id = ref($ids[0]) ? $ids[0] : \@ids; |
1687
|
0
|
|
|
|
|
0
|
Bio::DB::GFF::ID_Iterator->new($self,$id,'group'); |
1688
|
|
|
|
|
|
|
} |
1689
|
|
|
|
|
|
|
|
1690
|
|
|
|
|
|
|
=head2 all_seqfeatures |
1691
|
|
|
|
|
|
|
|
1692
|
|
|
|
|
|
|
Title : all_seqfeatures |
1693
|
|
|
|
|
|
|
Usage : @features = $db->all_seqfeatures(@args) |
1694
|
|
|
|
|
|
|
Function: fetch all the features in the database |
1695
|
|
|
|
|
|
|
Returns : an array of features, or an iterator |
1696
|
|
|
|
|
|
|
Args : See below |
1697
|
|
|
|
|
|
|
Status : public |
1698
|
|
|
|
|
|
|
|
1699
|
|
|
|
|
|
|
This is equivalent to calling $db-Efeatures() without any types, and |
1700
|
|
|
|
|
|
|
will return all the features in the database. The -merge and |
1701
|
|
|
|
|
|
|
-iterator arguments are recognized, and behave the same as described |
1702
|
|
|
|
|
|
|
for features(). |
1703
|
|
|
|
|
|
|
|
1704
|
|
|
|
|
|
|
=cut |
1705
|
|
|
|
|
|
|
|
1706
|
|
|
|
|
|
|
sub all_seqfeatures { |
1707
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1708
|
0
|
|
|
|
|
0
|
my ($automerge,$iterator)= rearrange([ |
1709
|
|
|
|
|
|
|
[qw(MERGE AUTOMERGE)], |
1710
|
|
|
|
|
|
|
'ITERATOR' |
1711
|
|
|
|
|
|
|
],@_); |
1712
|
0
|
|
|
|
|
0
|
my @args; |
1713
|
0
|
0
|
|
|
|
0
|
push @args,(-merge=>$automerge) if defined $automerge; |
1714
|
0
|
0
|
|
|
|
0
|
push @args,(-iterator=>$iterator) if defined $iterator; |
1715
|
0
|
|
|
|
|
0
|
$self->features(@args); |
1716
|
|
|
|
|
|
|
} |
1717
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
=head1 Creating and Loading GFF Databases |
1719
|
|
|
|
|
|
|
|
1720
|
|
|
|
|
|
|
=head2 initialize |
1721
|
|
|
|
|
|
|
|
1722
|
|
|
|
|
|
|
Title : initialize |
1723
|
|
|
|
|
|
|
Usage : $db->initialize(-erase=>$erase,-option1=>value1,-option2=>value2); |
1724
|
|
|
|
|
|
|
Function: initialize a GFF database |
1725
|
|
|
|
|
|
|
Returns : true if initialization successful |
1726
|
|
|
|
|
|
|
Args : a set of named parameters |
1727
|
|
|
|
|
|
|
Status : Public |
1728
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
This method can be used to initialize an empty database. It takes the following |
1730
|
|
|
|
|
|
|
named arguments: |
1731
|
|
|
|
|
|
|
|
1732
|
|
|
|
|
|
|
-erase A boolean value. If true the database will be wiped clean if it |
1733
|
|
|
|
|
|
|
already contains data. |
1734
|
|
|
|
|
|
|
|
1735
|
|
|
|
|
|
|
Other named arguments may be recognized by subclasses. They become database |
1736
|
|
|
|
|
|
|
meta values that control various settable options. |
1737
|
|
|
|
|
|
|
|
1738
|
|
|
|
|
|
|
As a shortcut (and for backward compatibility) a single true argument |
1739
|
|
|
|
|
|
|
is the same as initialize(-erase=E1). |
1740
|
|
|
|
|
|
|
|
1741
|
|
|
|
|
|
|
=cut |
1742
|
|
|
|
|
|
|
|
1743
|
|
|
|
|
|
|
sub initialize { |
1744
|
5
|
|
|
5
|
1
|
4243
|
my $self = shift; |
1745
|
|
|
|
|
|
|
|
1746
|
5
|
|
|
|
|
26
|
my ($erase,$meta) = rearrange(['ERASE'],@_); |
1747
|
5
|
|
50
|
|
|
23
|
$meta ||= {}; |
1748
|
|
|
|
|
|
|
|
1749
|
|
|
|
|
|
|
# initialize (possibly erasing) |
1750
|
5
|
50
|
|
|
|
17
|
return unless $self->do_initialize($erase); |
1751
|
5
|
|
|
|
|
23
|
my @default = $self->default_meta_values; |
1752
|
|
|
|
|
|
|
|
1753
|
|
|
|
|
|
|
# this is an awkward way of uppercasing the |
1754
|
|
|
|
|
|
|
# even-numbered values (necessary for case-insensitive SQL databases) |
1755
|
5
|
|
|
|
|
20
|
for (my $i=0; $i<@default; $i++) { |
1756
|
0
|
0
|
|
|
|
0
|
$default[$i] = uc $default[$i] if !($i % 2); |
1757
|
|
|
|
|
|
|
} |
1758
|
|
|
|
|
|
|
|
1759
|
5
|
|
|
|
|
17
|
my %values = (@default,%$meta); |
1760
|
5
|
|
|
|
|
10
|
foreach (keys %values) { |
1761
|
0
|
|
|
|
|
0
|
$self->meta($_ => $values{$_}); |
1762
|
|
|
|
|
|
|
} |
1763
|
5
|
|
|
|
|
20
|
1; |
1764
|
|
|
|
|
|
|
} |
1765
|
|
|
|
|
|
|
|
1766
|
|
|
|
|
|
|
|
1767
|
|
|
|
|
|
|
=head2 load_gff |
1768
|
|
|
|
|
|
|
|
1769
|
|
|
|
|
|
|
Title : load_gff |
1770
|
|
|
|
|
|
|
Usage : $db->load_gff($file|$directory|$filehandle [,$verbose]); |
1771
|
|
|
|
|
|
|
Function: load GFF data into database |
1772
|
|
|
|
|
|
|
Returns : count of records loaded |
1773
|
|
|
|
|
|
|
Args : a directory, a file, a list of files, |
1774
|
|
|
|
|
|
|
or a filehandle |
1775
|
|
|
|
|
|
|
Status : Public |
1776
|
|
|
|
|
|
|
|
1777
|
|
|
|
|
|
|
This method takes a single overloaded argument, which can be any of: |
1778
|
|
|
|
|
|
|
|
1779
|
|
|
|
|
|
|
=over 4 |
1780
|
|
|
|
|
|
|
|
1781
|
|
|
|
|
|
|
=item * |
1782
|
|
|
|
|
|
|
|
1783
|
|
|
|
|
|
|
a scalar corresponding to a GFF file on the system |
1784
|
|
|
|
|
|
|
|
1785
|
|
|
|
|
|
|
A pathname to a local GFF file. Any files ending with the .gz, .Z, or |
1786
|
|
|
|
|
|
|
.bz2 suffixes will be transparently decompressed with the appropriate |
1787
|
|
|
|
|
|
|
command-line utility. |
1788
|
|
|
|
|
|
|
|
1789
|
|
|
|
|
|
|
=item * |
1790
|
|
|
|
|
|
|
|
1791
|
|
|
|
|
|
|
an array reference containing a list of GFF files on the system |
1792
|
|
|
|
|
|
|
|
1793
|
|
|
|
|
|
|
For example ['/home/gff/gff1.gz','/home/gff/gff2.gz'] |
1794
|
|
|
|
|
|
|
|
1795
|
|
|
|
|
|
|
=item * |
1796
|
|
|
|
|
|
|
|
1797
|
|
|
|
|
|
|
directory path |
1798
|
|
|
|
|
|
|
|
1799
|
|
|
|
|
|
|
The indicated directory will be searched for all files ending in the |
1800
|
|
|
|
|
|
|
suffixes .gff, .gff.gz, .gff.Z or .gff.bz2. |
1801
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
=item * |
1803
|
|
|
|
|
|
|
|
1804
|
|
|
|
|
|
|
filehandle |
1805
|
|
|
|
|
|
|
|
1806
|
|
|
|
|
|
|
An open filehandle from which to read the GFF data. Tied filehandles |
1807
|
|
|
|
|
|
|
now work as well. |
1808
|
|
|
|
|
|
|
|
1809
|
|
|
|
|
|
|
=item * |
1810
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
a pipe expression |
1812
|
|
|
|
|
|
|
|
1813
|
|
|
|
|
|
|
A pipe expression will also work. For example, a GFF file on a remote |
1814
|
|
|
|
|
|
|
web server can be loaded with an expression like this: |
1815
|
|
|
|
|
|
|
|
1816
|
|
|
|
|
|
|
$db->load_gff("lynx -dump -source http://stein.cshl.org/gff_test |"); |
1817
|
|
|
|
|
|
|
|
1818
|
|
|
|
|
|
|
=back |
1819
|
|
|
|
|
|
|
|
1820
|
|
|
|
|
|
|
The optional second argument, if true, will turn on verbose status |
1821
|
|
|
|
|
|
|
reports that indicate the progress. |
1822
|
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
|
If successful, the method will return the number of GFF lines |
1824
|
|
|
|
|
|
|
successfully loaded. |
1825
|
|
|
|
|
|
|
|
1826
|
|
|
|
|
|
|
NOTE:this method used to be called load(), but has been changed. The |
1827
|
|
|
|
|
|
|
old method name is also recognized. |
1828
|
|
|
|
|
|
|
|
1829
|
|
|
|
|
|
|
=cut |
1830
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
sub load_gff { |
1832
|
5
|
|
|
5
|
1
|
13
|
my $self = shift; |
1833
|
5
|
|
50
|
|
|
15
|
my $file_or_directory = shift || '.'; |
1834
|
5
|
|
|
|
|
5
|
my $verbose = shift; |
1835
|
|
|
|
|
|
|
|
1836
|
5
|
|
|
|
|
19
|
local $self->{__verbose__} = $verbose; |
1837
|
5
|
50
|
33
|
|
|
22
|
return $self->do_load_gff($file_or_directory) if ref($file_or_directory) |
1838
|
|
|
|
|
|
|
&& tied *$file_or_directory; |
1839
|
|
|
|
|
|
|
|
1840
|
5
|
|
|
|
|
8
|
my $tied_stdin = tied(*STDIN); |
1841
|
5
|
50
|
|
|
|
157
|
open my $SAVEIN, "<&STDIN" unless $tied_stdin; |
1842
|
5
|
50
|
|
|
|
41
|
local @ARGV = $self->setup_argv($file_or_directory,'gff','gff3') or return; # to play tricks with reader |
1843
|
5
|
|
|
|
|
25
|
my $result = $self->do_load_gff('ARGV'); |
1844
|
5
|
50
|
|
|
|
138
|
open STDIN, '<', $SAVEIN unless $tied_stdin; # restore STDIN |
1845
|
5
|
|
|
|
|
53
|
return $result; |
1846
|
|
|
|
|
|
|
} |
1847
|
|
|
|
|
|
|
|
1848
|
|
|
|
|
|
|
*load = \&load_gff; |
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
=head2 load_gff_file |
1851
|
|
|
|
|
|
|
|
1852
|
|
|
|
|
|
|
Title : load_gff_file |
1853
|
|
|
|
|
|
|
Usage : $db->load_gff_file($file [,$verbose]); |
1854
|
|
|
|
|
|
|
Function: load GFF data into database |
1855
|
|
|
|
|
|
|
Returns : count of records loaded |
1856
|
|
|
|
|
|
|
Args : a path to a file |
1857
|
|
|
|
|
|
|
Status : Public |
1858
|
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
|
This is provided as an alternative to load_gff_file. It doesn't munge |
1860
|
|
|
|
|
|
|
STDIN or play tricks with ARGV. |
1861
|
|
|
|
|
|
|
|
1862
|
|
|
|
|
|
|
=cut |
1863
|
|
|
|
|
|
|
|
1864
|
|
|
|
|
|
|
sub load_gff_file { |
1865
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1866
|
0
|
|
|
|
|
0
|
my $file = shift; |
1867
|
0
|
|
|
|
|
0
|
my $verbose = shift; |
1868
|
0
|
0
|
|
|
|
0
|
my $fh = IO::File->new($file) or return; |
1869
|
0
|
|
|
|
|
0
|
return $self->do_load_gff($fh); |
1870
|
|
|
|
|
|
|
} |
1871
|
|
|
|
|
|
|
|
1872
|
|
|
|
|
|
|
=head2 load_fasta |
1873
|
|
|
|
|
|
|
|
1874
|
|
|
|
|
|
|
Title : load_fasta |
1875
|
|
|
|
|
|
|
Usage : $db->load_fasta($file|$directory|$filehandle); |
1876
|
|
|
|
|
|
|
Function: load FASTA data into database |
1877
|
|
|
|
|
|
|
Returns : count of records loaded |
1878
|
|
|
|
|
|
|
Args : a directory, a file, a list of files, |
1879
|
|
|
|
|
|
|
or a filehandle |
1880
|
|
|
|
|
|
|
Status : Public |
1881
|
|
|
|
|
|
|
|
1882
|
|
|
|
|
|
|
This method takes a single overloaded argument, which can be any of: |
1883
|
|
|
|
|
|
|
|
1884
|
|
|
|
|
|
|
=over 4 |
1885
|
|
|
|
|
|
|
|
1886
|
|
|
|
|
|
|
=item * |
1887
|
|
|
|
|
|
|
|
1888
|
|
|
|
|
|
|
scalar corresponding to a FASTA file on the system |
1889
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
A pathname to a local FASTA file. Any files ending with the .gz, .Z, or |
1891
|
|
|
|
|
|
|
.bz2 suffixes will be transparently decompressed with the appropriate |
1892
|
|
|
|
|
|
|
command-line utility. |
1893
|
|
|
|
|
|
|
|
1894
|
|
|
|
|
|
|
=item * |
1895
|
|
|
|
|
|
|
|
1896
|
|
|
|
|
|
|
array reference containing a list of FASTA files on the |
1897
|
|
|
|
|
|
|
system |
1898
|
|
|
|
|
|
|
|
1899
|
|
|
|
|
|
|
For example ['/home/fasta/genomic.fa.gz','/home/fasta/genomic.fa.gz'] |
1900
|
|
|
|
|
|
|
|
1901
|
|
|
|
|
|
|
=item * |
1902
|
|
|
|
|
|
|
|
1903
|
|
|
|
|
|
|
path to a directory |
1904
|
|
|
|
|
|
|
|
1905
|
|
|
|
|
|
|
The indicated directory will be searched for all files ending in the |
1906
|
|
|
|
|
|
|
suffixes .fa, .fa.gz, .fa.Z or .fa.bz2. |
1907
|
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
=item * |
1909
|
|
|
|
|
|
|
|
1910
|
|
|
|
|
|
|
filehandle |
1911
|
|
|
|
|
|
|
|
1912
|
|
|
|
|
|
|
An open filehandle from which to read the FASTA data. |
1913
|
|
|
|
|
|
|
|
1914
|
|
|
|
|
|
|
=item * |
1915
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
pipe expression |
1917
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
A pipe expression will also work. For example, a FASTA file on a remote |
1919
|
|
|
|
|
|
|
web server can be loaded with an expression like this: |
1920
|
|
|
|
|
|
|
|
1921
|
|
|
|
|
|
|
$db->load_gff("lynx -dump -source http://stein.cshl.org/fasta_test.fa |"); |
1922
|
|
|
|
|
|
|
|
1923
|
|
|
|
|
|
|
=back |
1924
|
|
|
|
|
|
|
|
1925
|
|
|
|
|
|
|
=cut |
1926
|
|
|
|
|
|
|
|
1927
|
|
|
|
|
|
|
sub load_fasta { |
1928
|
5
|
|
|
5
|
1
|
10
|
my $self = shift; |
1929
|
5
|
|
50
|
|
|
15
|
my $file_or_directory = shift || '.'; |
1930
|
5
|
|
|
|
|
5
|
my $verbose = shift; |
1931
|
|
|
|
|
|
|
|
1932
|
5
|
|
|
|
|
12
|
local $self->{__verbose__} = $verbose; |
1933
|
5
|
50
|
33
|
|
|
20
|
return $self->load_sequence($file_or_directory) if ref($file_or_directory) |
1934
|
|
|
|
|
|
|
&& tied *$file_or_directory; |
1935
|
|
|
|
|
|
|
|
1936
|
5
|
|
|
|
|
7
|
my $tied = tied(*STDIN); |
1937
|
5
|
50
|
|
|
|
71
|
open my $SAVEIN, "<&STDIN" unless $tied; |
1938
|
5
|
50
|
|
|
|
17
|
local @ARGV = $self->setup_argv($file_or_directory,'fa','dna','fasta') or return; # to play tricks with reader |
1939
|
5
|
|
|
|
|
29
|
my $result = $self->load_sequence('ARGV'); |
1940
|
5
|
50
|
|
|
|
150
|
open STDIN, '<', $SAVEIN unless $tied; # restore STDIN |
1941
|
5
|
|
|
|
|
79
|
return $result; |
1942
|
|
|
|
|
|
|
} |
1943
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
|
1945
|
|
|
|
|
|
|
=head2 load_fasta_file |
1946
|
|
|
|
|
|
|
|
1947
|
|
|
|
|
|
|
Title : load_fasta_file |
1948
|
|
|
|
|
|
|
Usage : $db->load_fasta_file($file [,$verbose]); |
1949
|
|
|
|
|
|
|
Function: load FASTA data into database |
1950
|
|
|
|
|
|
|
Returns : count of records loaded |
1951
|
|
|
|
|
|
|
Args : a path to a file |
1952
|
|
|
|
|
|
|
Status : Public |
1953
|
|
|
|
|
|
|
|
1954
|
|
|
|
|
|
|
This is provided as an alternative to load_fasta. It doesn't munge |
1955
|
|
|
|
|
|
|
STDIN or play tricks with ARGV. |
1956
|
|
|
|
|
|
|
|
1957
|
|
|
|
|
|
|
=cut |
1958
|
|
|
|
|
|
|
|
1959
|
|
|
|
|
|
|
sub load_fasta_file { |
1960
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1961
|
0
|
|
|
|
|
0
|
my $file = shift; |
1962
|
0
|
|
|
|
|
0
|
my $verbose = shift; |
1963
|
0
|
0
|
|
|
|
0
|
my $fh = IO::File->new($file) or return; |
1964
|
0
|
|
|
|
|
0
|
return $self->do_load_fasta($fh); |
1965
|
|
|
|
|
|
|
} |
1966
|
|
|
|
|
|
|
|
1967
|
|
|
|
|
|
|
|
1968
|
|
|
|
|
|
|
=head2 load_sequence_string |
1969
|
|
|
|
|
|
|
|
1970
|
|
|
|
|
|
|
Title : load_sequence_string |
1971
|
|
|
|
|
|
|
Usage : $db->load_sequence_string($id,$dna) |
1972
|
|
|
|
|
|
|
Function: load a single DNA entry |
1973
|
|
|
|
|
|
|
Returns : true if successfully loaded |
1974
|
|
|
|
|
|
|
Args : a raw sequence string (DNA, RNA, protein) |
1975
|
|
|
|
|
|
|
Status : Public |
1976
|
|
|
|
|
|
|
|
1977
|
|
|
|
|
|
|
=cut |
1978
|
|
|
|
|
|
|
|
1979
|
|
|
|
|
|
|
sub load_sequence_string { |
1980
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
1981
|
0
|
|
|
|
|
0
|
my ($acc,$seq) = @_; |
1982
|
0
|
|
|
|
|
0
|
my $offset = 0; |
1983
|
0
|
0
|
|
|
|
0
|
$self->insert_sequence_chunk($acc,\$offset,\$seq) or return; |
1984
|
0
|
0
|
|
|
|
0
|
$self->insert_sequence($acc,$offset,$seq) or return; |
1985
|
0
|
|
|
|
|
0
|
1; |
1986
|
|
|
|
|
|
|
} |
1987
|
|
|
|
|
|
|
|
1988
|
|
|
|
|
|
|
sub setup_argv { |
1989
|
10
|
|
|
10
|
0
|
15
|
my $self = shift; |
1990
|
10
|
|
|
|
|
12
|
my $file_or_directory = shift; |
1991
|
10
|
|
|
|
|
35
|
my @suffixes = @_; |
1992
|
3
|
|
|
3
|
|
15
|
no strict 'refs'; # so that we can call fileno() on the argument |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
11742
|
|
1993
|
|
|
|
|
|
|
|
1994
|
10
|
|
|
|
|
13
|
my @argv; |
1995
|
|
|
|
|
|
|
|
1996
|
10
|
100
|
|
|
|
260
|
if (-d $file_or_directory) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1997
|
|
|
|
|
|
|
# Because glob() is broken with long file names that contain spaces |
1998
|
5
|
50
|
33
|
|
|
41
|
$file_or_directory = Win32::GetShortPathName($file_or_directory) |
1999
|
|
|
|
|
|
|
if $^O =~ /^MSWin/i && eval 'use Win32; 1'; |
2000
|
5
|
|
|
|
|
10
|
@argv = map { glob("$file_or_directory/*.{$_,$_.gz,$_.Z,$_.bz2}")} @suffixes; |
|
15
|
|
|
|
|
1252
|
|
2001
|
|
|
|
|
|
|
}elsif (my $fd = fileno($file_or_directory)) { |
2002
|
0
|
0
|
|
|
|
0
|
open STDIN,"<&=$fd" or $self->throw("Can't dup STDIN"); |
2003
|
0
|
|
|
|
|
0
|
@argv = '-'; |
2004
|
|
|
|
|
|
|
} elsif (ref $file_or_directory) { |
2005
|
0
|
|
|
|
|
0
|
@argv = @$file_or_directory; |
2006
|
|
|
|
|
|
|
} else { |
2007
|
5
|
|
|
|
|
12
|
@argv = $file_or_directory; |
2008
|
|
|
|
|
|
|
} |
2009
|
|
|
|
|
|
|
|
2010
|
10
|
|
|
|
|
23
|
foreach (@argv) { |
2011
|
45
|
50
|
|
|
|
113
|
if (/\.gz$/) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
2012
|
0
|
|
|
|
|
0
|
$_ = "gunzip -c $_ |"; |
2013
|
|
|
|
|
|
|
} elsif (/\.Z$/) { |
2014
|
0
|
|
|
|
|
0
|
$_ = "uncompress -c $_ |"; |
2015
|
|
|
|
|
|
|
} elsif (/\.bz2$/) { |
2016
|
0
|
|
|
|
|
0
|
$_ = "bunzip2 -c $_ |"; |
2017
|
|
|
|
|
|
|
} |
2018
|
|
|
|
|
|
|
} |
2019
|
10
|
|
|
|
|
64
|
@argv; |
2020
|
|
|
|
|
|
|
} |
2021
|
|
|
|
|
|
|
|
2022
|
|
|
|
|
|
|
=head2 lock_on_load |
2023
|
|
|
|
|
|
|
|
2024
|
|
|
|
|
|
|
Title : lock_on_load |
2025
|
|
|
|
|
|
|
Usage : $lock = $db->lock_on_load([$lock]) |
2026
|
|
|
|
|
|
|
Function: set write locking during load |
2027
|
|
|
|
|
|
|
Returns : current value of lock-on-load flag |
2028
|
|
|
|
|
|
|
Args : new value of lock-on-load-flag |
2029
|
|
|
|
|
|
|
Status : Public |
2030
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
This method is honored by some of the adaptors. If the value is true, |
2032
|
|
|
|
|
|
|
the tables used by the GFF modules will be locked for writing during |
2033
|
|
|
|
|
|
|
loads and inaccessible to other processes. |
2034
|
|
|
|
|
|
|
|
2035
|
|
|
|
|
|
|
=cut |
2036
|
|
|
|
|
|
|
|
2037
|
|
|
|
|
|
|
sub lock_on_load { |
2038
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2039
|
0
|
|
|
|
|
0
|
my $d = $self->{lock}; |
2040
|
0
|
0
|
|
|
|
0
|
$self->{lock} = shift if @_; |
2041
|
0
|
|
|
|
|
0
|
$d; |
2042
|
|
|
|
|
|
|
} |
2043
|
|
|
|
|
|
|
|
2044
|
|
|
|
|
|
|
=head2 meta |
2045
|
|
|
|
|
|
|
|
2046
|
|
|
|
|
|
|
Title : meta |
2047
|
|
|
|
|
|
|
Usage : $value = $db->meta($name [,$newval]) |
2048
|
|
|
|
|
|
|
Function: get or set a meta variable |
2049
|
|
|
|
|
|
|
Returns : a string |
2050
|
|
|
|
|
|
|
Args : meta variable name and optionally value |
2051
|
|
|
|
|
|
|
Status : abstract |
2052
|
|
|
|
|
|
|
|
2053
|
|
|
|
|
|
|
Get or set a named metavalues for the database. Metavalues can be |
2054
|
|
|
|
|
|
|
used for database-specific settings. |
2055
|
|
|
|
|
|
|
|
2056
|
|
|
|
|
|
|
By default, this method does nothing! |
2057
|
|
|
|
|
|
|
|
2058
|
|
|
|
|
|
|
=cut |
2059
|
|
|
|
|
|
|
|
2060
|
|
|
|
|
|
|
sub meta { |
2061
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2062
|
0
|
|
|
|
|
0
|
my ($name,$value) = @_; |
2063
|
0
|
|
|
|
|
0
|
return; |
2064
|
|
|
|
|
|
|
} |
2065
|
|
|
|
|
|
|
|
2066
|
|
|
|
|
|
|
=head2 default_meta_values |
2067
|
|
|
|
|
|
|
|
2068
|
|
|
|
|
|
|
Title : default_meta_values |
2069
|
|
|
|
|
|
|
Usage : %values = $db->default_meta_values |
2070
|
|
|
|
|
|
|
Function: empty the database |
2071
|
|
|
|
|
|
|
Returns : a list of tag=>value pairs |
2072
|
|
|
|
|
|
|
Args : none |
2073
|
|
|
|
|
|
|
Status : protected |
2074
|
|
|
|
|
|
|
|
2075
|
|
|
|
|
|
|
This method returns a list of tag=Evalue pairs that contain default |
2076
|
|
|
|
|
|
|
meta information about the database. It is invoked by initialize() to |
2077
|
|
|
|
|
|
|
write out the default meta values. The base class version returns an |
2078
|
|
|
|
|
|
|
empty list. |
2079
|
|
|
|
|
|
|
|
2080
|
|
|
|
|
|
|
For things to work properly, meta value names must be UPPERCASE. |
2081
|
|
|
|
|
|
|
|
2082
|
|
|
|
|
|
|
=cut |
2083
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
sub default_meta_values { |
2085
|
5
|
|
|
5
|
1
|
8
|
my $self = shift; |
2086
|
5
|
|
|
|
|
7
|
return (); |
2087
|
|
|
|
|
|
|
} |
2088
|
|
|
|
|
|
|
|
2089
|
|
|
|
|
|
|
|
2090
|
|
|
|
|
|
|
=head2 error |
2091
|
|
|
|
|
|
|
|
2092
|
|
|
|
|
|
|
Title : error |
2093
|
|
|
|
|
|
|
Usage : $db->error( [$new error] ); |
2094
|
|
|
|
|
|
|
Function: read or set error message |
2095
|
|
|
|
|
|
|
Returns : error message |
2096
|
|
|
|
|
|
|
Args : an optional argument to set the error message |
2097
|
|
|
|
|
|
|
Status : Public |
2098
|
|
|
|
|
|
|
|
2099
|
|
|
|
|
|
|
This method can be used to retrieve the last error message. Errors |
2100
|
|
|
|
|
|
|
are not reset to empty by successful calls, so contents are only valid |
2101
|
|
|
|
|
|
|
immediately after an error condition has been detected. |
2102
|
|
|
|
|
|
|
|
2103
|
|
|
|
|
|
|
=cut |
2104
|
|
|
|
|
|
|
|
2105
|
|
|
|
|
|
|
sub error { |
2106
|
8
|
|
|
8
|
1
|
13
|
my $self = shift; |
2107
|
8
|
|
|
|
|
15
|
my $g = $self->{error}; |
2108
|
8
|
50
|
|
|
|
47
|
$self->{error} = join '',@_ if @_; |
2109
|
8
|
|
|
|
|
18
|
$g; |
2110
|
|
|
|
|
|
|
} |
2111
|
|
|
|
|
|
|
|
2112
|
|
|
|
|
|
|
=head2 debug |
2113
|
|
|
|
|
|
|
|
2114
|
|
|
|
|
|
|
Title : debug |
2115
|
|
|
|
|
|
|
Usage : $db->debug( [$flag] ); |
2116
|
|
|
|
|
|
|
Function: read or set debug flag |
2117
|
|
|
|
|
|
|
Returns : current value of debug flag |
2118
|
|
|
|
|
|
|
Args : new debug flag (optional) |
2119
|
|
|
|
|
|
|
Status : Public |
2120
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
This method can be used to turn on debug messages. The exact nature |
2122
|
|
|
|
|
|
|
of those messages depends on the adaptor in use. |
2123
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
=cut |
2125
|
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
|
sub debug { |
2127
|
216
|
|
|
216
|
1
|
2718
|
my $self = shift; |
2128
|
216
|
|
|
|
|
240
|
my $g = $self->{debug}; |
2129
|
216
|
100
|
|
|
|
348
|
$self->{debug} = shift if @_; |
2130
|
216
|
|
|
|
|
339
|
$g; |
2131
|
|
|
|
|
|
|
} |
2132
|
|
|
|
|
|
|
|
2133
|
|
|
|
|
|
|
|
2134
|
|
|
|
|
|
|
=head2 automerge |
2135
|
|
|
|
|
|
|
|
2136
|
|
|
|
|
|
|
Title : automerge |
2137
|
|
|
|
|
|
|
Usage : $db->automerge( [$new automerge] ); |
2138
|
|
|
|
|
|
|
Function: get or set automerge value |
2139
|
|
|
|
|
|
|
Returns : current value (boolean) |
2140
|
|
|
|
|
|
|
Args : an optional argument to set the automerge value |
2141
|
|
|
|
|
|
|
Status : Public |
2142
|
|
|
|
|
|
|
|
2143
|
|
|
|
|
|
|
By default, this module will use the aggregators to merge groups into |
2144
|
|
|
|
|
|
|
single composite objects. This default can be changed to false by |
2145
|
|
|
|
|
|
|
calling automerge(0). |
2146
|
|
|
|
|
|
|
|
2147
|
|
|
|
|
|
|
=cut |
2148
|
|
|
|
|
|
|
|
2149
|
|
|
|
|
|
|
sub automerge { |
2150
|
111
|
|
|
111
|
1
|
140
|
my $self = shift; |
2151
|
111
|
|
|
|
|
135
|
my $g = $self->{automerge}; |
2152
|
111
|
100
|
|
|
|
233
|
$self->{automerge} = shift if @_; |
2153
|
111
|
|
|
|
|
168
|
$g; |
2154
|
|
|
|
|
|
|
} |
2155
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
=head2 attributes |
2157
|
|
|
|
|
|
|
|
2158
|
|
|
|
|
|
|
Title : attributes |
2159
|
|
|
|
|
|
|
Usage : @attributes = $db->attributes($id,$name) |
2160
|
|
|
|
|
|
|
Function: get the "attributes" on a particular feature |
2161
|
|
|
|
|
|
|
Returns : an array of string |
2162
|
|
|
|
|
|
|
Args : feature ID |
2163
|
|
|
|
|
|
|
Status : public |
2164
|
|
|
|
|
|
|
|
2165
|
|
|
|
|
|
|
Some GFF version 2 files use the groups column to store a series of |
2166
|
|
|
|
|
|
|
attribute/value pairs. In this interpretation of GFF, the first such |
2167
|
|
|
|
|
|
|
pair is treated as the primary group for the feature; subsequent pairs |
2168
|
|
|
|
|
|
|
are treated as attributes. Two attributes have special meaning: |
2169
|
|
|
|
|
|
|
"Note" is for backward compatibility and is used for unstructured text |
2170
|
|
|
|
|
|
|
remarks. "Alias" is considered as a synonym for the feature name. |
2171
|
|
|
|
|
|
|
|
2172
|
|
|
|
|
|
|
If no name is provided, then attributes() returns a flattened hash, of |
2173
|
|
|
|
|
|
|
attribute=Evalue pairs. This lets you do: |
2174
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
%attributes = $db->attributes($id); |
2176
|
|
|
|
|
|
|
|
2177
|
|
|
|
|
|
|
If no arguments are provided, attributes() will return the list of |
2178
|
|
|
|
|
|
|
all attribute names: |
2179
|
|
|
|
|
|
|
|
2180
|
|
|
|
|
|
|
@attribute_names = $db->attributes(); |
2181
|
|
|
|
|
|
|
|
2182
|
|
|
|
|
|
|
Normally, however, attributes() will be called by the feature: |
2183
|
|
|
|
|
|
|
|
2184
|
|
|
|
|
|
|
@notes = $feature->attributes('Note'); |
2185
|
|
|
|
|
|
|
|
2186
|
|
|
|
|
|
|
In a scalar context, attributes() returns the first value of the |
2187
|
|
|
|
|
|
|
attribute if a tag is present, otherwise a hash reference in which the |
2188
|
|
|
|
|
|
|
keys are attribute names and the values are anonymous arrays |
2189
|
|
|
|
|
|
|
containing the values. |
2190
|
|
|
|
|
|
|
|
2191
|
|
|
|
|
|
|
=cut |
2192
|
|
|
|
|
|
|
|
2193
|
|
|
|
|
|
|
sub attributes { |
2194
|
20
|
|
|
20
|
1
|
20
|
my $self = shift; |
2195
|
20
|
|
|
|
|
38
|
my ($id,$tag) = @_; |
2196
|
20
|
50
|
|
|
|
129
|
my @result = $self->do_attributes(@_) or return; |
2197
|
20
|
100
|
|
|
|
80
|
return @result if wantarray; |
2198
|
|
|
|
|
|
|
|
2199
|
|
|
|
|
|
|
# what to do in an array context |
2200
|
10
|
100
|
|
|
|
37
|
return $result[0] if $tag; |
2201
|
5
|
|
|
|
|
8
|
my %result; |
2202
|
5
|
|
|
|
|
22
|
while (my($key,$value) = splice(@result,0,2)) { |
2203
|
15
|
|
|
|
|
7
|
push @{$result{$key}},$value; |
|
15
|
|
|
|
|
60
|
|
2204
|
|
|
|
|
|
|
} |
2205
|
5
|
|
|
|
|
22
|
return \%result; |
2206
|
|
|
|
|
|
|
} |
2207
|
|
|
|
|
|
|
|
2208
|
|
|
|
|
|
|
=head2 fast_queries |
2209
|
|
|
|
|
|
|
|
2210
|
|
|
|
|
|
|
Title : fast_queries |
2211
|
|
|
|
|
|
|
Usage : $flag = $db->fast_queries([$flag]) |
2212
|
|
|
|
|
|
|
Function: turn on and off the "fast queries" option |
2213
|
|
|
|
|
|
|
Returns : a boolean |
2214
|
|
|
|
|
|
|
Args : a boolean flag (optional) |
2215
|
|
|
|
|
|
|
Status : public |
2216
|
|
|
|
|
|
|
|
2217
|
|
|
|
|
|
|
The mysql database driver (and possibly others) support a "fast" query |
2218
|
|
|
|
|
|
|
mode that caches results on the server side. This makes queries come |
2219
|
|
|
|
|
|
|
back faster, particularly when creating iterators. The downside is |
2220
|
|
|
|
|
|
|
that while iterating, new queries will die with a "command synch" |
2221
|
|
|
|
|
|
|
error. This method turns the feature on and off. |
2222
|
|
|
|
|
|
|
|
2223
|
|
|
|
|
|
|
For databases that do not support a fast query, this method has no |
2224
|
|
|
|
|
|
|
effect. |
2225
|
|
|
|
|
|
|
|
2226
|
|
|
|
|
|
|
=cut |
2227
|
|
|
|
|
|
|
|
2228
|
|
|
|
|
|
|
# override this method in order to set the mysql_use_result attribute, which is an obscure |
2229
|
|
|
|
|
|
|
# but extremely powerful optimization for both performance and memory. |
2230
|
|
|
|
|
|
|
sub fast_queries { |
2231
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2232
|
0
|
|
|
|
|
0
|
my $d = $self->{fast_queries}; |
2233
|
0
|
0
|
|
|
|
0
|
$self->{fast_queries} = shift if @_; |
2234
|
0
|
|
|
|
|
0
|
$d; |
2235
|
|
|
|
|
|
|
} |
2236
|
|
|
|
|
|
|
|
2237
|
|
|
|
|
|
|
=head2 add_aggregator |
2238
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
Title : add_aggregator |
2240
|
|
|
|
|
|
|
Usage : $db->add_aggregator($aggregator) |
2241
|
|
|
|
|
|
|
Function: add an aggregator to the list |
2242
|
|
|
|
|
|
|
Returns : nothing |
2243
|
|
|
|
|
|
|
Args : an aggregator |
2244
|
|
|
|
|
|
|
Status : public |
2245
|
|
|
|
|
|
|
|
2246
|
|
|
|
|
|
|
This method will append an aggregator to the end of the list of |
2247
|
|
|
|
|
|
|
registered aggregators. Three different argument types are accepted: |
2248
|
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
|
1) a Bio::DB::GFF::Aggregator object -- will be added |
2250
|
|
|
|
|
|
|
2) a string in the form "aggregator_name{subpart1,subpart2,subpart3/main_method}" |
2251
|
|
|
|
|
|
|
-- will be turned into a Bio::DB::GFF::Aggregator object (the /main_method |
2252
|
|
|
|
|
|
|
part is optional). |
2253
|
|
|
|
|
|
|
3) a valid Perl token -- will be turned into a Bio::DB::GFF::Aggregator |
2254
|
|
|
|
|
|
|
subclass, where the token corresponds to the subclass name. |
2255
|
|
|
|
|
|
|
|
2256
|
|
|
|
|
|
|
=cut |
2257
|
|
|
|
|
|
|
|
2258
|
|
|
|
|
|
|
sub add_aggregator { |
2259
|
20
|
|
|
20
|
1
|
42
|
my $self = shift; |
2260
|
20
|
|
|
|
|
23
|
my $aggregator = shift; |
2261
|
20
|
|
100
|
|
|
93
|
my $list = $self->{aggregators} ||= []; |
2262
|
20
|
100
|
|
|
|
76
|
if (ref $aggregator) { # an object |
|
|
50
|
|
|
|
|
|
2263
|
5
|
|
|
|
|
10
|
@$list = grep {$_->get_method ne $aggregator->get_method} @$list; |
|
10
|
|
|
|
|
30
|
|
2264
|
5
|
|
|
|
|
17
|
push @$list,$aggregator; |
2265
|
|
|
|
|
|
|
} |
2266
|
|
|
|
|
|
|
|
2267
|
|
|
|
|
|
|
elsif ($aggregator =~ /^(\w+)\{([^\/\}]+)\/?(.*)\}$/) { |
2268
|
0
|
|
|
|
|
0
|
my($agg_name,$subparts,$mainpart) = ($1,$2,$3); |
2269
|
0
|
|
|
|
|
0
|
my @subparts = split /,\s*/,$subparts; |
2270
|
0
|
|
|
|
|
0
|
my @args = (-method => $agg_name, |
2271
|
|
|
|
|
|
|
-sub_parts => \@subparts); |
2272
|
0
|
0
|
|
|
|
0
|
if ($mainpart) { |
2273
|
0
|
|
|
|
|
0
|
push @args,(-main_method => $mainpart, |
2274
|
|
|
|
|
|
|
-whole_object => 1); |
2275
|
|
|
|
|
|
|
} |
2276
|
0
|
0
|
|
|
|
0
|
warn "making an aggregator with (@args), subparts = @subparts" if $self->debug; |
2277
|
0
|
|
|
|
|
0
|
push @$list,Bio::DB::GFF::Aggregator->new(@args); |
2278
|
|
|
|
|
|
|
} |
2279
|
|
|
|
|
|
|
|
2280
|
|
|
|
|
|
|
else { |
2281
|
15
|
|
|
|
|
41
|
my $class = "Bio::DB::GFF::Aggregator::\L${aggregator}\E"; |
2282
|
15
|
50
|
|
|
|
670
|
eval "require $class; 1" or $self->throw("Unable to load $aggregator aggregator: $@"); |
2283
|
15
|
|
|
|
|
100
|
push @$list,$class->new(); |
2284
|
|
|
|
|
|
|
} |
2285
|
|
|
|
|
|
|
} |
2286
|
|
|
|
|
|
|
|
2287
|
|
|
|
|
|
|
=head2 aggregators |
2288
|
|
|
|
|
|
|
|
2289
|
|
|
|
|
|
|
Title : aggregators |
2290
|
|
|
|
|
|
|
Usage : $db->aggregators([@new_aggregators]); |
2291
|
|
|
|
|
|
|
Function: retrieve list of aggregators |
2292
|
|
|
|
|
|
|
Returns : list of aggregators |
2293
|
|
|
|
|
|
|
Args : a list of aggregators to set (optional) |
2294
|
|
|
|
|
|
|
Status : public |
2295
|
|
|
|
|
|
|
|
2296
|
|
|
|
|
|
|
This method will get or set the list of aggregators assigned to |
2297
|
|
|
|
|
|
|
the database. If 1 or more arguments are passed, the existing |
2298
|
|
|
|
|
|
|
set will be cleared. |
2299
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
=cut |
2301
|
|
|
|
|
|
|
|
2302
|
|
|
|
|
|
|
sub aggregators { |
2303
|
106
|
|
|
106
|
1
|
102
|
my $self = shift; |
2304
|
106
|
|
|
|
|
126
|
my $d = $self->{aggregators}; |
2305
|
106
|
50
|
|
|
|
162
|
if (@_) { |
2306
|
0
|
|
|
|
|
0
|
$self->clear_aggregators; |
2307
|
0
|
|
|
|
|
0
|
$self->add_aggregator($_) foreach @_; |
2308
|
|
|
|
|
|
|
} |
2309
|
106
|
50
|
|
|
|
146
|
return unless $d; |
2310
|
106
|
|
|
|
|
214
|
return @$d; |
2311
|
|
|
|
|
|
|
} |
2312
|
|
|
|
|
|
|
|
2313
|
|
|
|
|
|
|
=head2 clear_aggregators |
2314
|
|
|
|
|
|
|
|
2315
|
|
|
|
|
|
|
Title : clear_aggregators |
2316
|
|
|
|
|
|
|
Usage : $db->clear_aggregators |
2317
|
|
|
|
|
|
|
Function: clears list of aggregators |
2318
|
|
|
|
|
|
|
Returns : nothing |
2319
|
|
|
|
|
|
|
Args : none |
2320
|
|
|
|
|
|
|
Status : public |
2321
|
|
|
|
|
|
|
|
2322
|
|
|
|
|
|
|
This method will clear the aggregators stored in the database object. |
2323
|
|
|
|
|
|
|
Use aggregators() or add_aggregator() to add some back. |
2324
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
=cut |
2326
|
|
|
|
|
|
|
|
2327
|
5
|
|
|
5
|
1
|
15
|
sub clear_aggregators { shift->{aggregators} = [] } |
2328
|
|
|
|
|
|
|
|
2329
|
|
|
|
|
|
|
=head2 preferred_groups |
2330
|
|
|
|
|
|
|
|
2331
|
|
|
|
|
|
|
Title : preferred_groups |
2332
|
|
|
|
|
|
|
Usage : $db->preferred_groups([$group_name_or_arrayref]) |
2333
|
|
|
|
|
|
|
Function: get/set list of groups for altering GFF2 parsing |
2334
|
|
|
|
|
|
|
Returns : a list of classes |
2335
|
|
|
|
|
|
|
Args : new list (scalar or array ref) |
2336
|
|
|
|
|
|
|
Status : public |
2337
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
=cut |
2339
|
|
|
|
|
|
|
|
2340
|
|
|
|
|
|
|
sub preferred_groups { |
2341
|
15
|
|
|
15
|
1
|
57
|
my $self = shift; |
2342
|
15
|
|
|
|
|
17
|
my $d = $self->{preferred_groups}; |
2343
|
15
|
100
|
|
|
|
25
|
if (@_) { |
2344
|
5
|
50
|
|
|
|
10
|
my @v = map {ref($_) eq 'ARRAY' ? @$_ : $_} @_; |
|
5
|
|
|
|
|
28
|
|
2345
|
5
|
|
|
|
|
9
|
$self->{preferred_groups} = \@v; |
2346
|
5
|
|
|
|
|
17
|
delete $self->{preferred_groups_hash}; |
2347
|
|
|
|
|
|
|
} |
2348
|
15
|
100
|
|
|
|
35
|
return unless $d; |
2349
|
10
|
|
|
|
|
25
|
return @$d; |
2350
|
|
|
|
|
|
|
} |
2351
|
|
|
|
|
|
|
|
2352
|
|
|
|
|
|
|
sub _preferred_groups_hash { |
2353
|
144
|
|
|
144
|
|
120
|
my $self = shift; |
2354
|
144
|
|
|
|
|
100
|
my $gff3 = shift; |
2355
|
144
|
100
|
|
|
|
279
|
return $self->{preferred_groups_hash} if exists $self->{preferred_groups_hash}; |
2356
|
5
|
|
|
|
|
7
|
my $count = 0; |
2357
|
|
|
|
|
|
|
|
2358
|
5
|
|
|
|
|
10
|
my @preferred = $self->preferred_groups; |
2359
|
|
|
|
|
|
|
|
2360
|
|
|
|
|
|
|
# defaults |
2361
|
5
|
50
|
|
|
|
15
|
if (!@preferred) { |
2362
|
0
|
0
|
0
|
|
|
0
|
@preferred = $gff3 || $self->{load_data}{gff3_flag} ? qw(Target Parent ID) : qw(Target Sequence Transcript); |
2363
|
|
|
|
|
|
|
} |
2364
|
|
|
|
|
|
|
|
2365
|
5
|
|
|
|
|
10
|
my %preferred = map {lc($_) => @preferred-$count++} @preferred; |
|
15
|
|
|
|
|
38
|
|
2366
|
5
|
|
|
|
|
19
|
return $self->{preferred_groups_hash} = \%preferred; |
2367
|
|
|
|
|
|
|
} |
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
=head1 Methods for use by Subclasses |
2370
|
|
|
|
|
|
|
|
2371
|
|
|
|
|
|
|
The following methods are chiefly of interest to subclasses and are |
2372
|
|
|
|
|
|
|
not intended for use by end programmers. |
2373
|
|
|
|
|
|
|
|
2374
|
|
|
|
|
|
|
=head2 abscoords |
2375
|
|
|
|
|
|
|
|
2376
|
|
|
|
|
|
|
Title : abscoords |
2377
|
|
|
|
|
|
|
Usage : $db->abscoords($name,$class,$refseq) |
2378
|
|
|
|
|
|
|
Function: finds position of a landmark in reference coordinates |
2379
|
|
|
|
|
|
|
Returns : ($ref,$class,$start,$stop,$strand) |
2380
|
|
|
|
|
|
|
Args : name and class of landmark |
2381
|
|
|
|
|
|
|
Status : public |
2382
|
|
|
|
|
|
|
|
2383
|
|
|
|
|
|
|
This method is called by Bio::DB::GFF::RelSegment to obtain the |
2384
|
|
|
|
|
|
|
absolute coordinates of a sequence landmark. The arguments are the |
2385
|
|
|
|
|
|
|
name and class of the landmark. If successful, abscoords() returns |
2386
|
|
|
|
|
|
|
the ID of the reference sequence, its class, its start and stop |
2387
|
|
|
|
|
|
|
positions, and the orientation of the reference sequence's coordinate |
2388
|
|
|
|
|
|
|
system ("+" for forward strand, "-" for reverse strand). |
2389
|
|
|
|
|
|
|
|
2390
|
|
|
|
|
|
|
If $refseq is present in the argument list, it forces the query to |
2391
|
|
|
|
|
|
|
search for the landmark in a particular reference sequence. |
2392
|
|
|
|
|
|
|
|
2393
|
|
|
|
|
|
|
=cut |
2394
|
|
|
|
|
|
|
|
2395
|
|
|
|
|
|
|
sub abscoords { |
2396
|
193
|
|
|
193
|
1
|
180
|
my $self = shift; |
2397
|
193
|
|
|
|
|
234
|
my ($name,$class,$refseq) = @_; |
2398
|
193
|
|
33
|
|
|
311
|
$class ||= $self->{default_class}; |
2399
|
193
|
|
|
|
|
463
|
$self->get_abscoords($name,$class,$refseq); |
2400
|
|
|
|
|
|
|
} |
2401
|
|
|
|
|
|
|
|
2402
|
|
|
|
|
|
|
=head1 Protected API |
2403
|
|
|
|
|
|
|
|
2404
|
|
|
|
|
|
|
The following methods are not intended for public consumption, but are |
2405
|
|
|
|
|
|
|
intended to be overridden/implemented by adaptors. |
2406
|
|
|
|
|
|
|
|
2407
|
|
|
|
|
|
|
=head2 default_aggregators |
2408
|
|
|
|
|
|
|
|
2409
|
|
|
|
|
|
|
Title : default_aggregators |
2410
|
|
|
|
|
|
|
Usage : $db->default_aggregators; |
2411
|
|
|
|
|
|
|
Function: retrieve list of aggregators |
2412
|
|
|
|
|
|
|
Returns : array reference containing list of aggregator names |
2413
|
|
|
|
|
|
|
Args : none |
2414
|
|
|
|
|
|
|
Status : protected |
2415
|
|
|
|
|
|
|
|
2416
|
|
|
|
|
|
|
This method (which is intended to be overridden by adaptors) returns a |
2417
|
|
|
|
|
|
|
list of standard aggregators to be applied when no aggregators are |
2418
|
|
|
|
|
|
|
specified in the constructor. |
2419
|
|
|
|
|
|
|
|
2420
|
|
|
|
|
|
|
=cut |
2421
|
|
|
|
|
|
|
|
2422
|
|
|
|
|
|
|
sub default_aggregators { |
2423
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2424
|
0
|
|
|
|
|
0
|
return ['processed_transcript','alignment']; |
2425
|
|
|
|
|
|
|
} |
2426
|
|
|
|
|
|
|
|
2427
|
|
|
|
|
|
|
=head2 do_load_gff |
2428
|
|
|
|
|
|
|
|
2429
|
|
|
|
|
|
|
Title : do_load_gff |
2430
|
|
|
|
|
|
|
Usage : $db->do_load_gff($handle) |
2431
|
|
|
|
|
|
|
Function: load a GFF input stream |
2432
|
|
|
|
|
|
|
Returns : number of features loaded |
2433
|
|
|
|
|
|
|
Args : A filehandle. |
2434
|
|
|
|
|
|
|
Status : protected |
2435
|
|
|
|
|
|
|
|
2436
|
|
|
|
|
|
|
This method is called to load a GFF data stream. The method will read |
2437
|
|
|
|
|
|
|
GFF features from EE and load them into the database. On exit the |
2438
|
|
|
|
|
|
|
method must return the number of features loaded. |
2439
|
|
|
|
|
|
|
|
2440
|
|
|
|
|
|
|
Note that the method is responsible for parsing the GFF lines. This |
2441
|
|
|
|
|
|
|
is to allow for differences in the interpretation of the "group" |
2442
|
|
|
|
|
|
|
field, which are legion. |
2443
|
|
|
|
|
|
|
|
2444
|
|
|
|
|
|
|
You probably want to use load_gff() instead. It is more flexible |
2445
|
|
|
|
|
|
|
about the arguments it accepts. |
2446
|
|
|
|
|
|
|
|
2447
|
|
|
|
|
|
|
=cut |
2448
|
|
|
|
|
|
|
|
2449
|
|
|
|
|
|
|
sub do_load_gff { |
2450
|
5
|
|
|
5
|
1
|
409
|
my $self = shift; |
2451
|
5
|
|
|
|
|
8
|
my $io_handle = shift; |
2452
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
local $self->{load_data} = { |
2454
|
5
|
50
|
33
|
|
|
55
|
lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"), |
2455
|
|
|
|
|
|
|
count => 0 |
2456
|
|
|
|
|
|
|
}; |
2457
|
|
|
|
|
|
|
|
2458
|
5
|
|
|
|
|
22
|
$self->setup_load(); |
2459
|
5
|
|
|
|
|
9
|
my $mode = 'gff'; |
2460
|
|
|
|
|
|
|
|
2461
|
5
|
|
|
|
|
206
|
while (<$io_handle>) { |
2462
|
214
|
|
|
|
|
177
|
chomp; |
2463
|
214
|
50
|
|
|
|
279
|
if ($mode eq 'gff') { |
|
|
0
|
|
|
|
|
|
2464
|
214
|
50
|
|
|
|
297
|
if (/^>/) { # Sequence coming |
2465
|
0
|
|
|
|
|
0
|
$mode = 'fasta'; |
2466
|
0
|
|
|
|
|
0
|
$self->_load_sequence_start; |
2467
|
0
|
|
|
|
|
0
|
$self->_load_sequence_line($_); |
2468
|
|
|
|
|
|
|
} else { |
2469
|
214
|
|
|
|
|
260
|
$self->_load_gff_line($_); |
2470
|
|
|
|
|
|
|
} |
2471
|
|
|
|
|
|
|
} |
2472
|
|
|
|
|
|
|
elsif ($mode eq 'fasta') { |
2473
|
0
|
0
|
|
|
|
0
|
if (/^##|\t/) { # Back to GFF mode |
2474
|
0
|
|
|
|
|
0
|
$self->_load_sequence_finish; |
2475
|
0
|
|
|
|
|
0
|
$mode = 'gff'; |
2476
|
0
|
|
|
|
|
0
|
$self->_load_gff_line($_); |
2477
|
|
|
|
|
|
|
} else { |
2478
|
0
|
|
|
|
|
0
|
$self->_load_sequence_line($_); |
2479
|
|
|
|
|
|
|
} |
2480
|
|
|
|
|
|
|
} |
2481
|
|
|
|
|
|
|
} |
2482
|
5
|
|
|
|
|
20
|
$self->finish_load(); |
2483
|
5
|
|
|
|
|
24
|
$self->_load_sequence_finish; |
2484
|
|
|
|
|
|
|
|
2485
|
5
|
|
|
|
|
17
|
return $self->{load_data}{count}; |
2486
|
|
|
|
|
|
|
} |
2487
|
|
|
|
|
|
|
|
2488
|
|
|
|
|
|
|
sub _load_gff_line { |
2489
|
214
|
|
|
214
|
|
148
|
my $self = shift; |
2490
|
214
|
|
|
|
|
174
|
my $line = shift; |
2491
|
214
|
|
|
|
|
185
|
my $lineend = $self->{load_data}{lineend}; |
2492
|
|
|
|
|
|
|
|
2493
|
214
|
100
|
|
|
|
324
|
$self->{load_data}{gff3_flag}++ if $line =~ /^\#\#\s*gff-version\s+3/; |
2494
|
|
|
|
|
|
|
|
2495
|
214
|
100
|
100
|
|
|
458
|
if (defined $self->{load_data}{gff3_flag} and !defined $self->{load_data}{gff3_warning}) { |
2496
|
2
|
|
|
|
|
24
|
$self->print_gff3_warning(); |
2497
|
2
|
|
|
|
|
8
|
$self->{load_data}{gff3_warning}=1; |
2498
|
|
|
|
|
|
|
} |
2499
|
|
|
|
|
|
|
|
2500
|
214
|
50
|
|
|
|
304
|
$self->preferred_groups(split(/\s+/,$1)) if $line =~ /^\#\#\s*group-tags?\s+(.+)/; |
2501
|
|
|
|
|
|
|
|
2502
|
214
|
100
|
|
|
|
302
|
if ($line =~ /^\#\#\s*sequence-region\s+(\S+)\s+(-?\d+)\s+(-?\d+)/i) { # header line |
2503
|
10
|
|
|
|
|
122
|
$self->load_gff_line( |
2504
|
|
|
|
|
|
|
{ |
2505
|
|
|
|
|
|
|
ref => $1, |
2506
|
|
|
|
|
|
|
class => 'Sequence', |
2507
|
|
|
|
|
|
|
source => 'reference', |
2508
|
|
|
|
|
|
|
method => 'Component', |
2509
|
|
|
|
|
|
|
start => $2, |
2510
|
|
|
|
|
|
|
stop => $3, |
2511
|
|
|
|
|
|
|
score => undef, |
2512
|
|
|
|
|
|
|
strand => undef, |
2513
|
|
|
|
|
|
|
phase => undef, |
2514
|
|
|
|
|
|
|
gclass => 'Sequence', |
2515
|
|
|
|
|
|
|
gname => $1, |
2516
|
|
|
|
|
|
|
tstart => undef, |
2517
|
|
|
|
|
|
|
tstop => undef, |
2518
|
|
|
|
|
|
|
attributes => [], |
2519
|
|
|
|
|
|
|
} |
2520
|
|
|
|
|
|
|
); |
2521
|
10
|
|
|
|
|
45
|
return $self->{load_data}{count}++; |
2522
|
|
|
|
|
|
|
} |
2523
|
|
|
|
|
|
|
|
2524
|
204
|
100
|
|
|
|
314
|
return if /^#/; |
2525
|
|
|
|
|
|
|
|
2526
|
190
|
|
|
|
|
535
|
my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t",$line; |
2527
|
190
|
50
|
66
|
|
|
1026
|
return unless defined($ref) && defined($method) && defined($start) && defined($stop); |
|
|
|
66
|
|
|
|
|
|
|
|
33
|
|
|
|
|
2528
|
165
|
|
|
|
|
231
|
foreach (\$score,\$strand,\$phase) { |
2529
|
495
|
100
|
|
|
|
678
|
undef $$_ if $$_ eq '.'; |
2530
|
|
|
|
|
|
|
} |
2531
|
|
|
|
|
|
|
|
2532
|
165
|
|
|
|
|
355
|
my ($gclass,$gname,$tstart,$tstop,$attributes) = $self->split_group($group,$self->{load_data}{gff3_flag}); |
2533
|
|
|
|
|
|
|
|
2534
|
|
|
|
|
|
|
# no standard way in the GFF file to denote the class of the reference sequence -- drat! |
2535
|
|
|
|
|
|
|
# so we invoke the factory to do it |
2536
|
165
|
|
|
|
|
306
|
my $class = $self->refclass($ref); |
2537
|
|
|
|
|
|
|
|
2538
|
|
|
|
|
|
|
# call subclass to do the dirty work |
2539
|
165
|
50
|
|
|
|
327
|
if ($start > $stop) { |
2540
|
0
|
|
|
|
|
0
|
($start,$stop) = ($stop,$start); |
2541
|
0
|
0
|
|
|
|
0
|
if ($strand eq '+') { |
|
|
0
|
|
|
|
|
|
2542
|
0
|
|
|
|
|
0
|
$strand = '-'; |
2543
|
|
|
|
|
|
|
} elsif ($strand eq '-') { |
2544
|
0
|
|
|
|
|
0
|
$strand = '+'; |
2545
|
|
|
|
|
|
|
} |
2546
|
|
|
|
|
|
|
} |
2547
|
|
|
|
|
|
|
# GFF2/3 transition stuff |
2548
|
165
|
50
|
|
|
|
306
|
$gclass = [$gclass] unless ref $gclass; |
2549
|
165
|
50
|
|
|
|
239
|
$gname = [$gname] unless ref $gname; |
2550
|
165
|
|
|
|
|
279
|
for (my $i=0; $i<@$gname;$i++) { |
2551
|
165
|
|
|
|
|
1152
|
$self->load_gff_line({ref => $ref, |
2552
|
|
|
|
|
|
|
class => $class, |
2553
|
|
|
|
|
|
|
source => $source, |
2554
|
|
|
|
|
|
|
method => $method, |
2555
|
|
|
|
|
|
|
start => $start, |
2556
|
|
|
|
|
|
|
stop => $stop, |
2557
|
|
|
|
|
|
|
score => $score, |
2558
|
|
|
|
|
|
|
strand => $strand, |
2559
|
|
|
|
|
|
|
phase => $phase, |
2560
|
|
|
|
|
|
|
gclass => $gclass->[$i], |
2561
|
|
|
|
|
|
|
gname => $gname->[$i], |
2562
|
|
|
|
|
|
|
tstart => $tstart, |
2563
|
|
|
|
|
|
|
tstop => $tstop, |
2564
|
|
|
|
|
|
|
attributes => $attributes} |
2565
|
|
|
|
|
|
|
); |
2566
|
165
|
|
|
|
|
792
|
$self->{load_data}{count}++; |
2567
|
|
|
|
|
|
|
} |
2568
|
|
|
|
|
|
|
} |
2569
|
|
|
|
|
|
|
|
2570
|
|
|
|
|
|
|
sub _load_sequence_start { |
2571
|
5
|
|
|
5
|
|
11
|
my $self = shift; |
2572
|
5
|
|
|
|
|
10
|
my $ld = $self->{load_data}; |
2573
|
5
|
|
|
|
|
12
|
undef $ld->{id}; |
2574
|
5
|
|
|
|
|
12
|
$ld->{offset} = 0; |
2575
|
5
|
|
|
|
|
14
|
$ld->{seq} = ''; |
2576
|
|
|
|
|
|
|
} |
2577
|
|
|
|
|
|
|
sub _load_sequence_finish { |
2578
|
10
|
|
|
10
|
|
15
|
my $self = shift; |
2579
|
10
|
|
|
|
|
22
|
my $ld = $self->{load_data}; |
2580
|
10
|
100
|
|
|
|
36
|
$self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id}; |
2581
|
|
|
|
|
|
|
} |
2582
|
|
|
|
|
|
|
|
2583
|
|
|
|
|
|
|
sub _load_sequence_line { |
2584
|
32950
|
|
|
32950
|
|
19252
|
my $self = shift; |
2585
|
32950
|
|
|
|
|
19959
|
my $line = shift; |
2586
|
32950
|
|
|
|
|
21035
|
my $ld = $self->{load_data}; |
2587
|
32950
|
|
|
|
|
20852
|
my $lineend = $ld->{lineend}; |
2588
|
|
|
|
|
|
|
|
2589
|
32950
|
100
|
|
|
|
35739
|
if (/^>(\S+)/) { |
2590
|
2620
|
100
|
|
|
|
6043
|
$self->insert_sequence($ld->{id},$ld->{offset},$ld->{seq}) if defined $ld->{id}; |
2591
|
2620
|
|
|
|
|
2763
|
$ld->{id} = $1; |
2592
|
2620
|
|
|
|
|
1715
|
$ld->{offset} = 0; |
2593
|
2620
|
|
|
|
|
1608
|
$ld->{seq} = ''; |
2594
|
2620
|
|
|
|
|
1592
|
$ld->{count}++; |
2595
|
2620
|
50
|
33
|
|
|
8945
|
print STDERR $ld->{count}," sequences loaded$lineend" if $self->{__verbose__} && $ld->{count} % 1000 == 0; |
2596
|
|
|
|
|
|
|
} else { |
2597
|
30330
|
|
|
|
|
21940
|
$ld->{seq} .= $_; |
2598
|
30330
|
|
|
|
|
33153
|
$self->insert_sequence_chunk($ld->{id},\$ld->{offset},\$ld->{seq}); |
2599
|
|
|
|
|
|
|
} |
2600
|
|
|
|
|
|
|
|
2601
|
|
|
|
|
|
|
} |
2602
|
|
|
|
|
|
|
|
2603
|
|
|
|
|
|
|
=head2 load_sequence |
2604
|
|
|
|
|
|
|
|
2605
|
|
|
|
|
|
|
Title : load_sequence |
2606
|
|
|
|
|
|
|
Usage : $db->load_sequence($handle) |
2607
|
|
|
|
|
|
|
Function: load a FASTA data stream |
2608
|
|
|
|
|
|
|
Returns : number of sequences |
2609
|
|
|
|
|
|
|
Args : a filehandle to the FASTA file |
2610
|
|
|
|
|
|
|
Status : protected |
2611
|
|
|
|
|
|
|
|
2612
|
|
|
|
|
|
|
You probably want to use load_fasta() instead. |
2613
|
|
|
|
|
|
|
|
2614
|
|
|
|
|
|
|
=cut |
2615
|
|
|
|
|
|
|
|
2616
|
|
|
|
|
|
|
# note - there is some repeated code here |
2617
|
|
|
|
|
|
|
sub load_sequence { |
2618
|
5
|
|
|
5
|
1
|
8
|
my $self = shift; |
2619
|
5
|
|
|
|
|
10
|
my $io_handle = shift; |
2620
|
|
|
|
|
|
|
|
2621
|
|
|
|
|
|
|
local $self->{load_data} = { |
2622
|
5
|
50
|
33
|
|
|
37
|
lineend => (-t STDERR && !$ENV{EMACS} ? "\r" : "\n"), |
2623
|
|
|
|
|
|
|
count => 0 |
2624
|
|
|
|
|
|
|
}; |
2625
|
|
|
|
|
|
|
|
2626
|
5
|
|
|
|
|
37
|
$self->_load_sequence_start; |
2627
|
5
|
|
|
|
|
185
|
while (<$io_handle>) { |
2628
|
32950
|
|
|
|
|
20558
|
chomp; |
2629
|
32950
|
|
|
|
|
29652
|
$self->_load_sequence_line($_); |
2630
|
|
|
|
|
|
|
} |
2631
|
5
|
|
|
|
|
21
|
$self->_load_sequence_finish; |
2632
|
5
|
|
|
|
|
27
|
return $self->{load_data}{count}; |
2633
|
|
|
|
|
|
|
} |
2634
|
|
|
|
|
|
|
|
2635
|
|
|
|
|
|
|
sub insert_sequence_chunk { |
2636
|
30330
|
|
|
30330
|
0
|
18560
|
my $self = shift; |
2637
|
30330
|
|
|
|
|
21676
|
my ($id,$offsetp,$seqp) = @_; |
2638
|
30330
|
50
|
|
|
|
25069
|
if (my $cs = $self->dna_chunk_size) { |
2639
|
0
|
|
|
|
|
0
|
while (length($$seqp) >= $cs) { |
2640
|
0
|
|
|
|
|
0
|
my $chunk = substr($$seqp,0,$cs); |
2641
|
0
|
|
|
|
|
0
|
$self->insert_sequence($id,$$offsetp,$chunk); |
2642
|
0
|
|
|
|
|
0
|
$$offsetp += length($chunk); |
2643
|
0
|
|
|
|
|
0
|
substr($$seqp,0,$cs) = ''; |
2644
|
|
|
|
|
|
|
} |
2645
|
|
|
|
|
|
|
} |
2646
|
30330
|
|
|
|
|
61145
|
return 1; # the calling routine may expect success or failure |
2647
|
|
|
|
|
|
|
} |
2648
|
|
|
|
|
|
|
|
2649
|
|
|
|
|
|
|
# used to store big pieces of DNA in itty bitty pieces |
2650
|
|
|
|
|
|
|
sub dna_chunk_size { |
2651
|
30330
|
|
|
30330
|
0
|
37323
|
return 0; |
2652
|
|
|
|
|
|
|
} |
2653
|
|
|
|
|
|
|
|
2654
|
|
|
|
|
|
|
sub insert_sequence { |
2655
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
2656
|
0
|
|
|
|
|
0
|
my($id,$offset,$seq) = @_; |
2657
|
0
|
|
|
|
|
0
|
$self->throw('insert_sequence(): must be defined in subclass'); |
2658
|
|
|
|
|
|
|
} |
2659
|
|
|
|
|
|
|
|
2660
|
|
|
|
|
|
|
# This is the default class for reference points. Defaults to Sequence. |
2661
|
|
|
|
|
|
|
sub default_class { |
2662
|
335
|
|
|
335
|
0
|
268
|
my $self = shift; |
2663
|
335
|
50
|
|
|
|
564
|
return 'Sequence' unless ref $self; |
2664
|
335
|
|
|
|
|
428
|
my $d = $self->{default_class}; |
2665
|
335
|
100
|
|
|
|
489
|
$self->{default_class} = shift if @_; |
2666
|
335
|
|
|
|
|
569
|
$d; |
2667
|
|
|
|
|
|
|
} |
2668
|
|
|
|
|
|
|
|
2669
|
|
|
|
|
|
|
# gets name of the reference sequence, and returns its class |
2670
|
|
|
|
|
|
|
# currently just calls default_class |
2671
|
|
|
|
|
|
|
sub refclass { |
2672
|
165
|
|
|
165
|
1
|
119
|
my $self = shift; |
2673
|
165
|
|
|
|
|
111
|
my $name = shift; |
2674
|
165
|
|
|
|
|
201
|
return $self->default_class; |
2675
|
|
|
|
|
|
|
} |
2676
|
|
|
|
|
|
|
|
2677
|
|
|
|
|
|
|
=head2 setup_load |
2678
|
|
|
|
|
|
|
|
2679
|
|
|
|
|
|
|
Title : setup_load |
2680
|
|
|
|
|
|
|
Usage : $db->setup_load |
2681
|
|
|
|
|
|
|
Function: called before load_gff_line() |
2682
|
|
|
|
|
|
|
Returns : void |
2683
|
|
|
|
|
|
|
Args : none |
2684
|
|
|
|
|
|
|
Status : abstract |
2685
|
|
|
|
|
|
|
|
2686
|
|
|
|
|
|
|
This abstract method gives subclasses a chance to do any |
2687
|
|
|
|
|
|
|
schema-specific initialization prior to loading a set of GFF records. |
2688
|
|
|
|
|
|
|
It must be implemented by a subclass. |
2689
|
|
|
|
|
|
|
|
2690
|
|
|
|
|
|
|
=cut |
2691
|
|
|
|
|
|
|
|
2692
|
|
|
|
0
|
1
|
|
sub setup_load { |
2693
|
|
|
|
|
|
|
# default, do nothing |
2694
|
|
|
|
|
|
|
} |
2695
|
|
|
|
|
|
|
|
2696
|
|
|
|
|
|
|
=head2 finish_load |
2697
|
|
|
|
|
|
|
|
2698
|
|
|
|
|
|
|
Title : finish_load |
2699
|
|
|
|
|
|
|
Usage : $db->finish_load |
2700
|
|
|
|
|
|
|
Function: called after load_gff_line() |
2701
|
|
|
|
|
|
|
Returns : number of records loaded |
2702
|
|
|
|
|
|
|
Args : none |
2703
|
|
|
|
|
|
|
Status :abstract |
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
This method gives subclasses a chance to do any schema-specific |
2706
|
|
|
|
|
|
|
cleanup after loading a set of GFF records. |
2707
|
|
|
|
|
|
|
|
2708
|
|
|
|
|
|
|
=cut |
2709
|
|
|
|
|
|
|
|
2710
|
|
|
|
0
|
1
|
|
sub finish_load { |
2711
|
|
|
|
|
|
|
# default, do nothing |
2712
|
|
|
|
|
|
|
} |
2713
|
|
|
|
|
|
|
|
2714
|
|
|
|
|
|
|
=head2 load_gff_line |
2715
|
|
|
|
|
|
|
|
2716
|
|
|
|
|
|
|
Title : load_gff_line |
2717
|
|
|
|
|
|
|
Usage : $db->load_gff_line(@args) |
2718
|
|
|
|
|
|
|
Function: called to load one parsed line of GFF |
2719
|
|
|
|
|
|
|
Returns : true if successfully inserted |
2720
|
|
|
|
|
|
|
Args : see below |
2721
|
|
|
|
|
|
|
Status : abstract |
2722
|
|
|
|
|
|
|
|
2723
|
|
|
|
|
|
|
This abstract method is called once per line of the GFF and passed a |
2724
|
|
|
|
|
|
|
hashref containing parsed GFF fields. The fields are: |
2725
|
|
|
|
|
|
|
|
2726
|
|
|
|
|
|
|
{ref => $ref, |
2727
|
|
|
|
|
|
|
class => $class, |
2728
|
|
|
|
|
|
|
source => $source, |
2729
|
|
|
|
|
|
|
method => $method, |
2730
|
|
|
|
|
|
|
start => $start, |
2731
|
|
|
|
|
|
|
stop => $stop, |
2732
|
|
|
|
|
|
|
score => $score, |
2733
|
|
|
|
|
|
|
strand => $strand, |
2734
|
|
|
|
|
|
|
phase => $phase, |
2735
|
|
|
|
|
|
|
gclass => $gclass, |
2736
|
|
|
|
|
|
|
gname => $gname, |
2737
|
|
|
|
|
|
|
tstart => $tstart, |
2738
|
|
|
|
|
|
|
tstop => $tstop, |
2739
|
|
|
|
|
|
|
attributes => $attributes} |
2740
|
|
|
|
|
|
|
|
2741
|
|
|
|
|
|
|
=cut |
2742
|
|
|
|
|
|
|
|
2743
|
|
|
|
|
|
|
sub load_gff_line { |
2744
|
0
|
|
|
0
|
1
|
0
|
shift->throw("load_gff_line(): must be implemented by an adaptor"); |
2745
|
|
|
|
|
|
|
} |
2746
|
|
|
|
|
|
|
|
2747
|
|
|
|
|
|
|
|
2748
|
|
|
|
|
|
|
=head2 do_initialize |
2749
|
|
|
|
|
|
|
|
2750
|
|
|
|
|
|
|
Title : do_initialize |
2751
|
|
|
|
|
|
|
Usage : $db->do_initialize([$erase]) |
2752
|
|
|
|
|
|
|
Function: initialize and possibly erase database |
2753
|
|
|
|
|
|
|
Returns : true if successful |
2754
|
|
|
|
|
|
|
Args : optional erase flag |
2755
|
|
|
|
|
|
|
Status : protected |
2756
|
|
|
|
|
|
|
|
2757
|
|
|
|
|
|
|
This method implements the initialize() method described above, and |
2758
|
|
|
|
|
|
|
takes the same arguments. |
2759
|
|
|
|
|
|
|
|
2760
|
|
|
|
|
|
|
=cut |
2761
|
|
|
|
|
|
|
|
2762
|
|
|
|
|
|
|
sub do_initialize { |
2763
|
0
|
|
|
0
|
1
|
0
|
shift->throw('do_initialize(): must be implemented by an adaptor'); |
2764
|
|
|
|
|
|
|
} |
2765
|
|
|
|
|
|
|
|
2766
|
|
|
|
|
|
|
=head2 dna |
2767
|
|
|
|
|
|
|
|
2768
|
|
|
|
|
|
|
Title : dna |
2769
|
|
|
|
|
|
|
Usage : $db->dna($id,$start,$stop,$class) |
2770
|
|
|
|
|
|
|
Function: return the raw DNA string for a segment |
2771
|
|
|
|
|
|
|
Returns : a raw DNA string |
2772
|
|
|
|
|
|
|
Args : id of the sequence, its class, start and stop positions |
2773
|
|
|
|
|
|
|
Status : public |
2774
|
|
|
|
|
|
|
|
2775
|
|
|
|
|
|
|
This method is invoked by Bio::DB::GFF::Segment to fetch the raw DNA |
2776
|
|
|
|
|
|
|
sequence. |
2777
|
|
|
|
|
|
|
|
2778
|
|
|
|
|
|
|
Arguments: -name sequence name |
2779
|
|
|
|
|
|
|
-start start position |
2780
|
|
|
|
|
|
|
-stop stop position |
2781
|
|
|
|
|
|
|
-class sequence class |
2782
|
|
|
|
|
|
|
|
2783
|
|
|
|
|
|
|
If start and stop are both undef, then the entire DNA is retrieved. |
2784
|
|
|
|
|
|
|
So to fetch the whole dna, call like this: |
2785
|
|
|
|
|
|
|
|
2786
|
|
|
|
|
|
|
$db->dna($name_of_sequence); |
2787
|
|
|
|
|
|
|
|
2788
|
|
|
|
|
|
|
or like this: |
2789
|
|
|
|
|
|
|
|
2790
|
|
|
|
|
|
|
$db->dna(-name=>$name_of_sequence,-class=>$class_of_sequence); |
2791
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
NOTE: you will probably prefer to create a Segment and then invoke its |
2793
|
|
|
|
|
|
|
dna() method. |
2794
|
|
|
|
|
|
|
|
2795
|
|
|
|
|
|
|
=cut |
2796
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
# call to return the DNA string for the indicated region |
2798
|
|
|
|
|
|
|
# real work is done by get_dna() |
2799
|
|
|
|
|
|
|
sub dna { |
2800
|
120
|
|
|
120
|
1
|
102
|
my $self = shift; |
2801
|
120
|
|
|
|
|
383
|
my ($id,$start,$stop,$class) = rearrange([ |
2802
|
|
|
|
|
|
|
[qw(NAME ID REF REFSEQ)], |
2803
|
|
|
|
|
|
|
qw(START), |
2804
|
|
|
|
|
|
|
[qw(STOP END)], |
2805
|
|
|
|
|
|
|
'CLASS', |
2806
|
|
|
|
|
|
|
],@_); |
2807
|
|
|
|
|
|
|
# return unless defined $start && defined $stop; |
2808
|
120
|
|
|
|
|
370
|
$self->get_dna($id,$start,$stop,$class); |
2809
|
|
|
|
|
|
|
} |
2810
|
|
|
|
|
|
|
|
2811
|
0
|
|
|
0
|
0
|
0
|
sub fetch_sequence { shift->dna(@_) } |
2812
|
|
|
|
|
|
|
|
2813
|
|
|
|
|
|
|
sub features_in_range { |
2814
|
85
|
|
|
85
|
0
|
65
|
my $self = shift; |
2815
|
85
|
|
|
|
|
402
|
my ($range_type,$refseq,$class,$start,$stop,$types,$parent,$sparse,$automerge,$iterator,$other) = |
2816
|
|
|
|
|
|
|
rearrange([ |
2817
|
|
|
|
|
|
|
[qw(RANGE_TYPE)], |
2818
|
|
|
|
|
|
|
[qw(REF REFSEQ)], |
2819
|
|
|
|
|
|
|
qw(CLASS), |
2820
|
|
|
|
|
|
|
qw(START), |
2821
|
|
|
|
|
|
|
[qw(STOP END)], |
2822
|
|
|
|
|
|
|
[qw(TYPE TYPES)], |
2823
|
|
|
|
|
|
|
qw(PARENT), |
2824
|
|
|
|
|
|
|
[qw(RARE SPARSE)], |
2825
|
|
|
|
|
|
|
[qw(MERGE AUTOMERGE)], |
2826
|
|
|
|
|
|
|
'ITERATOR' |
2827
|
|
|
|
|
|
|
],@_); |
2828
|
85
|
|
100
|
|
|
376
|
$other ||= {}; |
2829
|
|
|
|
|
|
|
# $automerge = $types && $self->automerge unless defined $automerge; |
2830
|
85
|
100
|
|
|
|
230
|
$automerge = $self->automerge unless defined $automerge; |
2831
|
|
|
|
|
|
|
$self->throw("range type must be one of {". |
2832
|
|
|
|
|
|
|
join(',',keys %valid_range_types). |
2833
|
|
|
|
|
|
|
"}\n") |
2834
|
85
|
50
|
|
|
|
206
|
unless $valid_range_types{lc $range_type}; |
2835
|
85
|
|
|
|
|
591
|
$self->_features({ |
2836
|
|
|
|
|
|
|
rangetype => lc $range_type, |
2837
|
|
|
|
|
|
|
refseq => $refseq, |
2838
|
|
|
|
|
|
|
refclass => $class, |
2839
|
|
|
|
|
|
|
start => $start, |
2840
|
|
|
|
|
|
|
stop => $stop, |
2841
|
|
|
|
|
|
|
types => $types }, |
2842
|
|
|
|
|
|
|
{ |
2843
|
|
|
|
|
|
|
sparse => $sparse, |
2844
|
|
|
|
|
|
|
automerge => $automerge, |
2845
|
|
|
|
|
|
|
iterator => $iterator, |
2846
|
|
|
|
|
|
|
%$other, |
2847
|
|
|
|
|
|
|
}, |
2848
|
|
|
|
|
|
|
$parent); |
2849
|
|
|
|
|
|
|
} |
2850
|
|
|
|
|
|
|
|
2851
|
|
|
|
|
|
|
=head2 get_dna |
2852
|
|
|
|
|
|
|
|
2853
|
|
|
|
|
|
|
Title : get_dna |
2854
|
|
|
|
|
|
|
Usage : $db->get_dna($id,$start,$stop,$class) |
2855
|
|
|
|
|
|
|
Function: get DNA for indicated segment |
2856
|
|
|
|
|
|
|
Returns : the dna string |
2857
|
|
|
|
|
|
|
Args : sequence ID, start, stop and class |
2858
|
|
|
|
|
|
|
Status : protected |
2859
|
|
|
|
|
|
|
|
2860
|
|
|
|
|
|
|
If start E stop and the sequence is nucleotide, then this method |
2861
|
|
|
|
|
|
|
should return the reverse complement. The sequence class may be |
2862
|
|
|
|
|
|
|
ignored by those databases that do not recognize different object |
2863
|
|
|
|
|
|
|
types. |
2864
|
|
|
|
|
|
|
|
2865
|
|
|
|
|
|
|
=cut |
2866
|
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
|
sub get_dna { |
2868
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2869
|
0
|
|
|
|
|
0
|
my ($id,$start,$stop,$class,) = @_; |
2870
|
0
|
|
|
|
|
0
|
$self->throw("get_dna() must be implemented by an adaptor"); |
2871
|
|
|
|
|
|
|
} |
2872
|
|
|
|
|
|
|
|
2873
|
|
|
|
|
|
|
=head2 get_features |
2874
|
|
|
|
|
|
|
|
2875
|
|
|
|
|
|
|
Title : get_features |
2876
|
|
|
|
|
|
|
Usage : $db->get_features($search,$options,$callback) |
2877
|
|
|
|
|
|
|
Function: get list of features for a region |
2878
|
|
|
|
|
|
|
Returns : count of number of features retrieved |
2879
|
|
|
|
|
|
|
Args : see below |
2880
|
|
|
|
|
|
|
Status : protected |
2881
|
|
|
|
|
|
|
|
2882
|
|
|
|
|
|
|
The first argument is a hash reference containing search criteria for |
2883
|
|
|
|
|
|
|
retrieving features. It contains the following keys: |
2884
|
|
|
|
|
|
|
|
2885
|
|
|
|
|
|
|
rangetype One of "overlaps", "contains" or "contained_in". Indicates |
2886
|
|
|
|
|
|
|
the type of range query requested. |
2887
|
|
|
|
|
|
|
|
2888
|
|
|
|
|
|
|
refseq ID of the landmark that establishes the absolute |
2889
|
|
|
|
|
|
|
coordinate system. |
2890
|
|
|
|
|
|
|
|
2891
|
|
|
|
|
|
|
refclass Class of this landmark. Can be ignored by implementations |
2892
|
|
|
|
|
|
|
that don't recognize such distinctions. |
2893
|
|
|
|
|
|
|
|
2894
|
|
|
|
|
|
|
start Start of the range, inclusive. |
2895
|
|
|
|
|
|
|
|
2896
|
|
|
|
|
|
|
stop Stop of the range, inclusive. |
2897
|
|
|
|
|
|
|
|
2898
|
|
|
|
|
|
|
types Array reference containing the list of annotation types |
2899
|
|
|
|
|
|
|
to fetch from the database. Each annotation type is an |
2900
|
|
|
|
|
|
|
array reference consisting of [source,method]. |
2901
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
The second argument is a hash reference containing certain options |
2903
|
|
|
|
|
|
|
that affect the way information is retrieved: |
2904
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
sort_by_group |
2906
|
|
|
|
|
|
|
A flag. If true, means that the returned features should be |
2907
|
|
|
|
|
|
|
sorted by the group that they're in. |
2908
|
|
|
|
|
|
|
|
2909
|
|
|
|
|
|
|
sparse A flag. If true, means that the expected density of the |
2910
|
|
|
|
|
|
|
features is such that it will be more efficient to search |
2911
|
|
|
|
|
|
|
by type rather than by range. If it is taking a long |
2912
|
|
|
|
|
|
|
time to fetch features, give this a try. |
2913
|
|
|
|
|
|
|
|
2914
|
|
|
|
|
|
|
binsize A true value will create a set of artificial features whose |
2915
|
|
|
|
|
|
|
start and stop positions indicate bins of the given size, and |
2916
|
|
|
|
|
|
|
whose scores are the number of features in the bin. The |
2917
|
|
|
|
|
|
|
class of the feature will be set to "bin", and its name to |
2918
|
|
|
|
|
|
|
"method:source". This is a handy way of generating histograms |
2919
|
|
|
|
|
|
|
of feature density. |
2920
|
|
|
|
|
|
|
|
2921
|
|
|
|
|
|
|
The third argument, the $callback, is a code reference to which |
2922
|
|
|
|
|
|
|
retrieved features are passed. It is described in more detail below. |
2923
|
|
|
|
|
|
|
|
2924
|
|
|
|
|
|
|
This routine is responsible for getting arrays of GFF data out of the |
2925
|
|
|
|
|
|
|
database and passing them to the callback subroutine. The callback |
2926
|
|
|
|
|
|
|
does the work of constructing a Bio::DB::GFF::Feature object out of |
2927
|
|
|
|
|
|
|
that data. The callback expects a list of 13 fields: |
2928
|
|
|
|
|
|
|
|
2929
|
|
|
|
|
|
|
$refseq The reference sequence |
2930
|
|
|
|
|
|
|
$start feature start |
2931
|
|
|
|
|
|
|
$stop feature stop |
2932
|
|
|
|
|
|
|
$source feature source |
2933
|
|
|
|
|
|
|
$method feature method |
2934
|
|
|
|
|
|
|
$score feature score |
2935
|
|
|
|
|
|
|
$strand feature strand |
2936
|
|
|
|
|
|
|
$phase feature phase |
2937
|
|
|
|
|
|
|
$groupclass group class (may be undef) |
2938
|
|
|
|
|
|
|
$groupname group ID (may be undef) |
2939
|
|
|
|
|
|
|
$tstart target start for similarity hits (may be undef) |
2940
|
|
|
|
|
|
|
$tstop target stop for similarity hits (may be undef) |
2941
|
|
|
|
|
|
|
$feature_id A unique feature ID (may be undef) |
2942
|
|
|
|
|
|
|
|
2943
|
|
|
|
|
|
|
These fields are in the same order as the raw GFF file, with the |
2944
|
|
|
|
|
|
|
exception that the group column has been parsed into group class and |
2945
|
|
|
|
|
|
|
group name fields. |
2946
|
|
|
|
|
|
|
|
2947
|
|
|
|
|
|
|
The feature ID, if provided, is a unique identifier of the feature |
2948
|
|
|
|
|
|
|
line. The module does not depend on this ID in any way, but it is |
2949
|
|
|
|
|
|
|
available via Bio::DB::GFF-Eid() if wanted. In the dbi::mysql and |
2950
|
|
|
|
|
|
|
dbi::mysqlopt adaptor, the ID is a unique row ID. In the acedb |
2951
|
|
|
|
|
|
|
adaptor it is not used. |
2952
|
|
|
|
|
|
|
|
2953
|
|
|
|
|
|
|
=cut |
2954
|
|
|
|
|
|
|
|
2955
|
|
|
|
|
|
|
=head2 feature_summary(), coverage_array() |
2956
|
|
|
|
|
|
|
|
2957
|
|
|
|
|
|
|
The DBI adaptors provide methods for rapidly fetching coverage |
2958
|
|
|
|
|
|
|
statistics across a region of interest. Please see |
2959
|
|
|
|
|
|
|
L for more information about these |
2960
|
|
|
|
|
|
|
methods. |
2961
|
|
|
|
|
|
|
|
2962
|
|
|
|
|
|
|
=cut |
2963
|
|
|
|
|
|
|
|
2964
|
|
|
|
|
|
|
sub get_features{ |
2965
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
2966
|
0
|
|
|
|
|
0
|
my ($search,$options,$callback) = @_; |
2967
|
0
|
|
|
|
|
0
|
$self->throw("get_features() must be implemented by an adaptor"); |
2968
|
|
|
|
|
|
|
} |
2969
|
|
|
|
|
|
|
|
2970
|
|
|
|
|
|
|
|
2971
|
|
|
|
|
|
|
=head2 _feature_by_name |
2972
|
|
|
|
|
|
|
|
2973
|
|
|
|
|
|
|
Title : _feature_by_name |
2974
|
|
|
|
|
|
|
Usage : $db->_feature_by_name($class,$name,$location,$callback) |
2975
|
|
|
|
|
|
|
Function: get a list of features by name and class |
2976
|
|
|
|
|
|
|
Returns : count of number of features retrieved |
2977
|
|
|
|
|
|
|
Args : name of feature, class of feature, and a callback |
2978
|
|
|
|
|
|
|
Status : abstract |
2979
|
|
|
|
|
|
|
|
2980
|
|
|
|
|
|
|
This method is used internally. The callback arguments are the same |
2981
|
|
|
|
|
|
|
as those used by make_feature(). This method must be overidden by |
2982
|
|
|
|
|
|
|
subclasses. |
2983
|
|
|
|
|
|
|
|
2984
|
|
|
|
|
|
|
=cut |
2985
|
|
|
|
|
|
|
|
2986
|
|
|
|
|
|
|
sub _feature_by_name { |
2987
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2988
|
0
|
|
|
|
|
0
|
my ($class,$name,$location,$callback) = @_; |
2989
|
0
|
|
|
|
|
0
|
$self->throw("_feature_by_name() must be implemented by an adaptor"); |
2990
|
|
|
|
|
|
|
} |
2991
|
|
|
|
|
|
|
|
2992
|
|
|
|
|
|
|
sub _feature_by_attribute { |
2993
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
2994
|
0
|
|
|
|
|
0
|
my ($attributes,$callback) = @_; |
2995
|
0
|
|
|
|
|
0
|
$self->throw("_feature_by_name() must be implemented by an adaptor"); |
2996
|
|
|
|
|
|
|
} |
2997
|
|
|
|
|
|
|
|
2998
|
|
|
|
|
|
|
=head2 _feature_by_id |
2999
|
|
|
|
|
|
|
|
3000
|
|
|
|
|
|
|
Title : _feature_by_id |
3001
|
|
|
|
|
|
|
Usage : $db->_feature_by_id($ids,$type,$callback) |
3002
|
|
|
|
|
|
|
Function: get a feature based |
3003
|
|
|
|
|
|
|
Returns : count of number of features retrieved |
3004
|
|
|
|
|
|
|
Args : arrayref to feature IDs to fetch |
3005
|
|
|
|
|
|
|
Status : abstract |
3006
|
|
|
|
|
|
|
|
3007
|
|
|
|
|
|
|
This method is used internally to fetch features either by their ID or |
3008
|
|
|
|
|
|
|
their group ID. $ids is a arrayref containing a list of IDs, $type is |
3009
|
|
|
|
|
|
|
one of "feature" or "group", and $callback is a callback. The |
3010
|
|
|
|
|
|
|
callback arguments are the same as those used by make_feature(). This |
3011
|
|
|
|
|
|
|
method must be overidden by subclasses. |
3012
|
|
|
|
|
|
|
|
3013
|
|
|
|
|
|
|
=cut |
3014
|
|
|
|
|
|
|
|
3015
|
|
|
|
|
|
|
sub _feature_by_id { |
3016
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
3017
|
0
|
|
|
|
|
0
|
my ($ids,$type,$callback) = @_; |
3018
|
0
|
|
|
|
|
0
|
$self->throw("_feature_by_id() must be implemented by an adaptor"); |
3019
|
|
|
|
|
|
|
} |
3020
|
|
|
|
|
|
|
|
3021
|
|
|
|
|
|
|
=head2 overlapping_features |
3022
|
|
|
|
|
|
|
|
3023
|
|
|
|
|
|
|
Title : overlapping_features |
3024
|
|
|
|
|
|
|
Usage : $db->overlapping_features(@args) |
3025
|
|
|
|
|
|
|
Function: get features that overlap the indicated range |
3026
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
3027
|
|
|
|
|
|
|
Args : see below |
3028
|
|
|
|
|
|
|
Status : public |
3029
|
|
|
|
|
|
|
|
3030
|
|
|
|
|
|
|
This method is invoked by Bio::DB::GFF::Segment-Efeatures() to find |
3031
|
|
|
|
|
|
|
the list of features that overlap a given range. It is generally |
3032
|
|
|
|
|
|
|
preferable to create the Segment first, and then fetch the features. |
3033
|
|
|
|
|
|
|
|
3034
|
|
|
|
|
|
|
This method takes set of named arguments: |
3035
|
|
|
|
|
|
|
|
3036
|
|
|
|
|
|
|
-refseq ID of the reference sequence |
3037
|
|
|
|
|
|
|
-class Class of the reference sequence |
3038
|
|
|
|
|
|
|
-start Start of the desired range in refseq coordinates |
3039
|
|
|
|
|
|
|
-stop Stop of the desired range in refseq coordinates |
3040
|
|
|
|
|
|
|
-types List of feature types to return. Argument is an array |
3041
|
|
|
|
|
|
|
reference containing strings of the format "method:source" |
3042
|
|
|
|
|
|
|
-parent A parent Bio::DB::GFF::Segment object, used to create |
3043
|
|
|
|
|
|
|
relative coordinates in the generated features. |
3044
|
|
|
|
|
|
|
-rare Turn on an optimization suitable for a relatively rare feature type, |
3045
|
|
|
|
|
|
|
where it will be faster to filter by feature type first |
3046
|
|
|
|
|
|
|
and then by position, rather than vice versa. |
3047
|
|
|
|
|
|
|
-merge Whether to apply aggregators to the generated features. |
3048
|
|
|
|
|
|
|
-iterator Whether to return an iterator across the features. |
3049
|
|
|
|
|
|
|
|
3050
|
|
|
|
|
|
|
If -iterator is true, then the method returns a single scalar value |
3051
|
|
|
|
|
|
|
consisting of a Bio::SeqIO object. You can call next_seq() repeatedly |
3052
|
|
|
|
|
|
|
on this object to fetch each of the features in turn. If iterator is |
3053
|
|
|
|
|
|
|
false or absent, then all the features are returned as a list. |
3054
|
|
|
|
|
|
|
|
3055
|
|
|
|
|
|
|
Currently aggregation is disabled when iterating over a series of |
3056
|
|
|
|
|
|
|
features. |
3057
|
|
|
|
|
|
|
|
3058
|
|
|
|
|
|
|
Types are indicated using the nomenclature "method:source". Either of |
3059
|
|
|
|
|
|
|
these fields can be omitted, in which case a wildcard is used for the |
3060
|
|
|
|
|
|
|
missing field. Type names without the colon (e.g. "exon") are |
3061
|
|
|
|
|
|
|
interpreted as the method name and a source wild card. Regular |
3062
|
|
|
|
|
|
|
expressions are allowed in either field, as in: "similarity:BLAST.*". |
3063
|
|
|
|
|
|
|
|
3064
|
|
|
|
|
|
|
=cut |
3065
|
|
|
|
|
|
|
|
3066
|
|
|
|
|
|
|
# call to return the features that overlap the named region |
3067
|
|
|
|
|
|
|
# real work is done by get_features |
3068
|
|
|
|
|
|
|
sub overlapping_features { |
3069
|
85
|
|
|
85
|
1
|
65
|
my $self = shift; |
3070
|
85
|
|
|
|
|
200
|
$self->features_in_range(-range_type=>'overlaps',@_); |
3071
|
|
|
|
|
|
|
} |
3072
|
|
|
|
|
|
|
|
3073
|
|
|
|
|
|
|
=head2 contained_features |
3074
|
|
|
|
|
|
|
|
3075
|
|
|
|
|
|
|
Title : contained_features |
3076
|
|
|
|
|
|
|
Usage : $db->contained_features(@args) |
3077
|
|
|
|
|
|
|
Function: get features that are contained within the indicated range |
3078
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
3079
|
|
|
|
|
|
|
Args : see overlapping_features() |
3080
|
|
|
|
|
|
|
Status : public |
3081
|
|
|
|
|
|
|
|
3082
|
|
|
|
|
|
|
This call is similar to overlapping_features(), except that it only |
3083
|
|
|
|
|
|
|
retrieves features whose end points are completely contained within |
3084
|
|
|
|
|
|
|
the specified range. |
3085
|
|
|
|
|
|
|
|
3086
|
|
|
|
|
|
|
Generally you will want to fetch a Bio::DB::GFF::Segment object and |
3087
|
|
|
|
|
|
|
call its contained_features() method rather than call this directly. |
3088
|
|
|
|
|
|
|
|
3089
|
|
|
|
|
|
|
=cut |
3090
|
|
|
|
|
|
|
|
3091
|
|
|
|
|
|
|
# The same, except that it only returns features that are completely contained within the |
3092
|
|
|
|
|
|
|
# range (much faster usually) |
3093
|
|
|
|
|
|
|
sub contained_features { |
3094
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
3095
|
0
|
|
|
|
|
0
|
$self->features_in_range(-range_type=>'contains',@_); |
3096
|
|
|
|
|
|
|
} |
3097
|
|
|
|
|
|
|
|
3098
|
|
|
|
|
|
|
=head2 contained_in |
3099
|
|
|
|
|
|
|
|
3100
|
|
|
|
|
|
|
Title : contained_in |
3101
|
|
|
|
|
|
|
Usage : @features = $s->contained_in(@args) |
3102
|
|
|
|
|
|
|
Function: get features that contain this segment |
3103
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
3104
|
|
|
|
|
|
|
Args : see features() |
3105
|
|
|
|
|
|
|
Status : Public |
3106
|
|
|
|
|
|
|
|
3107
|
|
|
|
|
|
|
This is identical in behavior to features() except that it returns |
3108
|
|
|
|
|
|
|
only those features that completely contain the segment. |
3109
|
|
|
|
|
|
|
|
3110
|
|
|
|
|
|
|
=cut |
3111
|
|
|
|
|
|
|
|
3112
|
|
|
|
|
|
|
sub contained_in { |
3113
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
3114
|
0
|
|
|
|
|
0
|
$self->features_in_range(-range_type=>'contained_in',@_); |
3115
|
|
|
|
|
|
|
} |
3116
|
|
|
|
|
|
|
|
3117
|
|
|
|
|
|
|
=head2 get_abscoords |
3118
|
|
|
|
|
|
|
|
3119
|
|
|
|
|
|
|
Title : get_abscoords |
3120
|
|
|
|
|
|
|
Usage : $db->get_abscoords($name,$class,$refseq) |
3121
|
|
|
|
|
|
|
Function: get the absolute coordinates of sequence with name & class |
3122
|
|
|
|
|
|
|
Returns : ($absref,$absstart,$absstop,$absstrand) |
3123
|
|
|
|
|
|
|
Args : name and class of the landmark |
3124
|
|
|
|
|
|
|
Status : protected |
3125
|
|
|
|
|
|
|
|
3126
|
|
|
|
|
|
|
Given the name and class of a genomic landmark, this function returns |
3127
|
|
|
|
|
|
|
a four-element array consisting of: |
3128
|
|
|
|
|
|
|
|
3129
|
|
|
|
|
|
|
$absref the ID of the reference sequence that contains this landmark |
3130
|
|
|
|
|
|
|
$absstart the position at which the landmark starts |
3131
|
|
|
|
|
|
|
$absstop the position at which the landmark stops |
3132
|
|
|
|
|
|
|
$absstrand the strand of the landmark, relative to the reference sequence |
3133
|
|
|
|
|
|
|
|
3134
|
|
|
|
|
|
|
If $refseq is provided, the function searches only within the |
3135
|
|
|
|
|
|
|
specified reference sequence. |
3136
|
|
|
|
|
|
|
|
3137
|
|
|
|
|
|
|
=cut |
3138
|
|
|
|
|
|
|
|
3139
|
|
|
|
|
|
|
sub get_abscoords { |
3140
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
3141
|
0
|
|
|
|
|
0
|
my ($name,$class,$refseq) = @_; |
3142
|
0
|
|
|
|
|
0
|
$self->throw("get_abscoords() must be implemented by an adaptor"); |
3143
|
|
|
|
|
|
|
} |
3144
|
|
|
|
|
|
|
|
3145
|
|
|
|
|
|
|
=head2 get_types |
3146
|
|
|
|
|
|
|
|
3147
|
|
|
|
|
|
|
Title : get_types |
3148
|
|
|
|
|
|
|
Usage : $db->get_types($absref,$class,$start,$stop,$count) |
3149
|
|
|
|
|
|
|
Function: get list of all feature types on the indicated segment |
3150
|
|
|
|
|
|
|
Returns : list or hash of Bio::DB::GFF::Typename objects |
3151
|
|
|
|
|
|
|
Args : see below |
3152
|
|
|
|
|
|
|
Status : protected |
3153
|
|
|
|
|
|
|
|
3154
|
|
|
|
|
|
|
Arguments are: |
3155
|
|
|
|
|
|
|
|
3156
|
|
|
|
|
|
|
$absref the ID of the reference sequence |
3157
|
|
|
|
|
|
|
$class the class of the reference sequence |
3158
|
|
|
|
|
|
|
$start the position to start counting |
3159
|
|
|
|
|
|
|
$stop the position to end counting |
3160
|
|
|
|
|
|
|
$count a boolean indicating whether to count the number |
3161
|
|
|
|
|
|
|
of occurrences of each feature type |
3162
|
|
|
|
|
|
|
|
3163
|
|
|
|
|
|
|
If $count is true, then a hash is returned. The keys of the hash are |
3164
|
|
|
|
|
|
|
feature type names in the format "method:source" and the values are |
3165
|
|
|
|
|
|
|
the number of times a feature of this type overlaps the indicated |
3166
|
|
|
|
|
|
|
segment. Otherwise, the call returns a set of Bio::DB::GFF::Typename |
3167
|
|
|
|
|
|
|
objects. If $start or $stop are undef, then all features on the |
3168
|
|
|
|
|
|
|
indicated segment are enumerated. If $absref is undef, then the call |
3169
|
|
|
|
|
|
|
returns all feature types in the database. |
3170
|
|
|
|
|
|
|
|
3171
|
|
|
|
|
|
|
=cut |
3172
|
|
|
|
|
|
|
|
3173
|
|
|
|
|
|
|
sub get_types { |
3174
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
3175
|
0
|
|
|
|
|
0
|
my ($refseq,$class,$start,$stop,$count,$types) = @_; |
3176
|
0
|
|
|
|
|
0
|
$self->throw("get_types() must be implemented by an adaptor"); |
3177
|
|
|
|
|
|
|
} |
3178
|
|
|
|
|
|
|
|
3179
|
|
|
|
|
|
|
=head2 make_feature |
3180
|
|
|
|
|
|
|
|
3181
|
|
|
|
|
|
|
Title : make_feature |
3182
|
|
|
|
|
|
|
Usage : $db->make_feature(@args) |
3183
|
|
|
|
|
|
|
Function: Create a Bio::DB::GFF::Feature object from string data |
3184
|
|
|
|
|
|
|
Returns : a Bio::DB::GFF::Feature object |
3185
|
|
|
|
|
|
|
Args : see below |
3186
|
|
|
|
|
|
|
Status : internal |
3187
|
|
|
|
|
|
|
|
3188
|
|
|
|
|
|
|
This takes 14 arguments (really!): |
3189
|
|
|
|
|
|
|
|
3190
|
|
|
|
|
|
|
$parent A Bio::DB::GFF::RelSegment object |
3191
|
|
|
|
|
|
|
$group_hash A hashref containing unique list of GFF groups |
3192
|
|
|
|
|
|
|
$refname The name of the reference sequence for this feature |
3193
|
|
|
|
|
|
|
$refclass The class of the reference sequence for this feature |
3194
|
|
|
|
|
|
|
$start Start of feature |
3195
|
|
|
|
|
|
|
$stop Stop of feature |
3196
|
|
|
|
|
|
|
$source Feature source field |
3197
|
|
|
|
|
|
|
$method Feature method field |
3198
|
|
|
|
|
|
|
$score Feature score field |
3199
|
|
|
|
|
|
|
$strand Feature strand |
3200
|
|
|
|
|
|
|
$phase Feature phase |
3201
|
|
|
|
|
|
|
$group_class Class of feature group |
3202
|
|
|
|
|
|
|
$group_name Name of feature group |
3203
|
|
|
|
|
|
|
$tstart For homologies, start of hit on target |
3204
|
|
|
|
|
|
|
$tstop Stop of hit on target |
3205
|
|
|
|
|
|
|
|
3206
|
|
|
|
|
|
|
The $parent argument, if present, is used to establish relative |
3207
|
|
|
|
|
|
|
coordinates in the resulting Bio::DB::Feature object. This allows one |
3208
|
|
|
|
|
|
|
feature to generate a list of other features that are relative to its |
3209
|
|
|
|
|
|
|
coordinate system (for example, finding the coordinates of the second |
3210
|
|
|
|
|
|
|
exon relative to the coordinates of the first). |
3211
|
|
|
|
|
|
|
|
3212
|
|
|
|
|
|
|
The $group_hash allows the group_class/group_name strings to be turned |
3213
|
|
|
|
|
|
|
into rich database objects via the make_obect() method (see above). |
3214
|
|
|
|
|
|
|
Because these objects may be expensive to create, $group_hash is used |
3215
|
|
|
|
|
|
|
to uniquefy them. The index of this hash is the composite key |
3216
|
|
|
|
|
|
|
{$group_class,$group_name,$tstart,$tstop}. Values are whatever object |
3217
|
|
|
|
|
|
|
is returned by the make_object() method. |
3218
|
|
|
|
|
|
|
|
3219
|
|
|
|
|
|
|
The remainder of the fields are taken from the GFF line, with the |
3220
|
|
|
|
|
|
|
exception that "Target" features, which contain information about the |
3221
|
|
|
|
|
|
|
target of a homology search, are parsed into their components. |
3222
|
|
|
|
|
|
|
|
3223
|
|
|
|
|
|
|
=cut |
3224
|
|
|
|
|
|
|
|
3225
|
|
|
|
|
|
|
# This call is responsible for turning a line of GFF into a |
3226
|
|
|
|
|
|
|
# feature object. |
3227
|
|
|
|
|
|
|
# The $parent argument is a Bio::DB::GFF::Segment object and is used |
3228
|
|
|
|
|
|
|
# to establish the coordinate system for the new feature. |
3229
|
|
|
|
|
|
|
# The $group_hash argument is an hash ref that holds previously- |
3230
|
|
|
|
|
|
|
# generated group objects. |
3231
|
|
|
|
|
|
|
# Other arguments are taken right out of the GFF table. |
3232
|
|
|
|
|
|
|
sub make_feature { |
3233
|
844
|
|
|
844
|
1
|
644
|
my $self = shift; |
3234
|
844
|
|
|
|
|
1335
|
my ($parent,$group_hash, # these arguments provided by generic mechanisms |
3235
|
|
|
|
|
|
|
$srcseq, # the rest is provided by adaptor |
3236
|
|
|
|
|
|
|
$start,$stop, |
3237
|
|
|
|
|
|
|
$source,$method, |
3238
|
|
|
|
|
|
|
$score,$strand,$phase, |
3239
|
|
|
|
|
|
|
$group_class,$group_name, |
3240
|
|
|
|
|
|
|
$tstart,$tstop, |
3241
|
|
|
|
|
|
|
$db_id,$group_id) = @_; |
3242
|
|
|
|
|
|
|
|
3243
|
844
|
100
|
|
|
|
1099
|
return unless $srcseq; # return undef if called with no arguments. This behavior is used for |
3244
|
|
|
|
|
|
|
# on-the-fly aggregation. |
3245
|
|
|
|
|
|
|
|
3246
|
829
|
|
|
|
|
498
|
my $group; # undefined |
3247
|
829
|
50
|
33
|
|
|
2322
|
if (defined $group_class && defined $group_name) { |
3248
|
829
|
|
100
|
|
|
1577
|
$tstart ||= ''; |
3249
|
829
|
|
100
|
|
|
1357
|
$tstop ||= ''; |
3250
|
829
|
100
|
|
|
|
954
|
if ($group_hash) { |
3251
|
559
|
|
66
|
|
|
1785
|
$group = $group_hash->{$group_class,$group_name,$tstart,$tstop} |
3252
|
|
|
|
|
|
|
||= $self->make_object($group_class,$group_name,$tstart,$tstop); |
3253
|
|
|
|
|
|
|
} else { |
3254
|
270
|
|
|
|
|
409
|
$group = $self->make_object($group_class,$group_name,$tstart,$tstop); |
3255
|
|
|
|
|
|
|
} |
3256
|
|
|
|
|
|
|
} |
3257
|
|
|
|
|
|
|
|
3258
|
|
|
|
|
|
|
# fix for some broken GFF files |
3259
|
|
|
|
|
|
|
# unfortunately - has undesired side effects |
3260
|
|
|
|
|
|
|
# if (defined $tstart && defined $tstop && !defined $strand) { |
3261
|
|
|
|
|
|
|
# $strand = $tstart <= $tstop ? '+' : '-'; |
3262
|
|
|
|
|
|
|
# } |
3263
|
|
|
|
|
|
|
|
3264
|
829
|
100
|
|
|
|
1079
|
if (ref $parent) { # note that the src sequence is ignored |
3265
|
505
|
|
|
|
|
861
|
return Bio::DB::GFF::Feature->new_from_parent($parent,$start,$stop, |
3266
|
|
|
|
|
|
|
$method,$source, |
3267
|
|
|
|
|
|
|
$score,$strand,$phase, |
3268
|
|
|
|
|
|
|
$group,$db_id,$group_id, |
3269
|
|
|
|
|
|
|
$tstart,$tstop); |
3270
|
|
|
|
|
|
|
} else { |
3271
|
324
|
|
|
|
|
629
|
return Bio::DB::GFF::Feature->new($self,$srcseq, |
3272
|
|
|
|
|
|
|
$start,$stop, |
3273
|
|
|
|
|
|
|
$method,$source, |
3274
|
|
|
|
|
|
|
$score,$strand,$phase, |
3275
|
|
|
|
|
|
|
$group,$db_id,$group_id, |
3276
|
|
|
|
|
|
|
$tstart,$tstop); |
3277
|
|
|
|
|
|
|
} |
3278
|
|
|
|
|
|
|
} |
3279
|
|
|
|
|
|
|
|
3280
|
|
|
|
|
|
|
sub make_aggregated_feature { |
3281
|
15
|
|
|
15
|
0
|
18
|
my $self = shift; |
3282
|
15
|
|
|
|
|
27
|
my ($accumulated_features,$parent,$aggregators) = splice(@_,0,3); |
3283
|
15
|
|
|
|
|
26
|
my $feature = $self->make_feature($parent,undef,@_); |
3284
|
15
|
50
|
66
|
|
|
27
|
return [$feature] if $feature && !$feature->group; |
3285
|
|
|
|
|
|
|
|
3286
|
|
|
|
|
|
|
# if we have accumulated features and either: |
3287
|
|
|
|
|
|
|
# (1) make_feature() returned undef, indicated very end or |
3288
|
|
|
|
|
|
|
# (2) the current group is different from the previous one |
3289
|
|
|
|
|
|
|
|
3290
|
15
|
|
|
|
|
38
|
local $^W = 0; # irritating uninitialized value warning in next statement |
3291
|
15
|
50
|
66
|
|
|
70
|
if (@$accumulated_features && |
|
|
|
66
|
|
|
|
|
3292
|
|
|
|
|
|
|
(!defined($feature) || ($accumulated_features->[-1]->group ne $feature->group))) { |
3293
|
10
|
|
|
|
|
18
|
foreach my $a (@$aggregators) { # last aggregator gets first shot |
3294
|
30
|
50
|
|
|
|
60
|
$a->aggregate($accumulated_features,$self) or next; |
3295
|
|
|
|
|
|
|
} |
3296
|
10
|
|
|
|
|
20
|
my @result = @$accumulated_features; |
3297
|
10
|
100
|
|
|
|
18
|
@$accumulated_features = $feature ? ($feature) : (); |
3298
|
10
|
50
|
|
|
|
25
|
return unless @result; |
3299
|
10
|
|
|
|
|
37
|
return \@result ; |
3300
|
|
|
|
|
|
|
} |
3301
|
5
|
|
|
|
|
10
|
push @$accumulated_features,$feature; |
3302
|
5
|
|
|
|
|
13
|
return; |
3303
|
|
|
|
|
|
|
} |
3304
|
|
|
|
|
|
|
|
3305
|
|
|
|
|
|
|
=head2 make_match_sub |
3306
|
|
|
|
|
|
|
|
3307
|
|
|
|
|
|
|
Title : make_match_sub |
3308
|
|
|
|
|
|
|
Usage : $db->make_match_sub($types) |
3309
|
|
|
|
|
|
|
Function: creates a subroutine used for filtering features |
3310
|
|
|
|
|
|
|
Returns : a code reference |
3311
|
|
|
|
|
|
|
Args : a list of parsed type names |
3312
|
|
|
|
|
|
|
Status : protected |
3313
|
|
|
|
|
|
|
|
3314
|
|
|
|
|
|
|
This method is used internally to generate a code subroutine that will |
3315
|
|
|
|
|
|
|
accept or reject a feature based on its method and source. It takes |
3316
|
|
|
|
|
|
|
an array of parsed type names in the format returned by parse_types(), |
3317
|
|
|
|
|
|
|
and generates an anonymous subroutine. The subroutine takes a single |
3318
|
|
|
|
|
|
|
Bio::DB::GFF::Feature object and returns true if the feature matches |
3319
|
|
|
|
|
|
|
one of the desired feature types, and false otherwise. |
3320
|
|
|
|
|
|
|
|
3321
|
|
|
|
|
|
|
=cut |
3322
|
|
|
|
|
|
|
|
3323
|
|
|
|
|
|
|
# a subroutine that matches features indicated by list of types |
3324
|
|
|
|
|
|
|
sub make_match_sub { |
3325
|
168
|
|
|
168
|
1
|
117
|
my $self = shift; |
3326
|
168
|
|
|
|
|
131
|
my $types = shift; |
3327
|
|
|
|
|
|
|
|
3328
|
168
|
50
|
33
|
0
|
|
582
|
return sub { 1 } unless ref $types && @$types; |
|
0
|
|
|
|
|
0
|
|
3329
|
|
|
|
|
|
|
|
3330
|
168
|
|
|
|
|
109
|
my @expr; |
3331
|
168
|
|
|
|
|
217
|
for my $type (@$types) { |
3332
|
1432
|
|
|
|
|
1308
|
my ($method,$source) = @$type; |
3333
|
1432
|
50
|
|
|
|
1720
|
$method = $method ? "\\Q$method\\E" : ".*"; |
3334
|
1432
|
50
|
|
|
|
1237
|
$source = $source ? ":\\Q$source\\E" : "(?::.+)?"; |
3335
|
1432
|
|
|
|
|
1682
|
push @expr,"${method}${source}"; |
3336
|
|
|
|
|
|
|
} |
3337
|
168
|
|
|
|
|
358
|
my $expr = join '|',@expr; |
3338
|
168
|
100
|
|
|
|
878
|
return $self->{match_subs}{$expr} if $self->{match_subs}{$expr}; |
3339
|
|
|
|
|
|
|
|
3340
|
15
|
|
|
|
|
46
|
my $sub =<
|
3341
|
|
|
|
|
|
|
sub { |
3342
|
|
|
|
|
|
|
my \$feature = shift or return; |
3343
|
|
|
|
|
|
|
return \$feature->type =~ /^($expr)\$/i; |
3344
|
|
|
|
|
|
|
} |
3345
|
|
|
|
|
|
|
END |
3346
|
15
|
50
|
|
|
|
39
|
warn "match sub: $sub\n" if $self->debug; |
3347
|
15
|
|
|
|
|
25
|
undef $@; |
3348
|
15
|
|
|
|
|
2632
|
my $compiled_sub = eval $sub; |
3349
|
15
|
50
|
|
|
|
38
|
$self->throw($@) if $@; |
3350
|
15
|
|
|
|
|
84
|
return $self->{match_subs}{$expr} = $compiled_sub; |
3351
|
|
|
|
|
|
|
} |
3352
|
|
|
|
|
|
|
|
3353
|
|
|
|
|
|
|
=head2 make_object |
3354
|
|
|
|
|
|
|
|
3355
|
|
|
|
|
|
|
Title : make_object |
3356
|
|
|
|
|
|
|
Usage : $db->make_object($class,$name,$start,$stop) |
3357
|
|
|
|
|
|
|
Function: creates a feature object |
3358
|
|
|
|
|
|
|
Returns : a feature object |
3359
|
|
|
|
|
|
|
Args : see below |
3360
|
|
|
|
|
|
|
Status : protected |
3361
|
|
|
|
|
|
|
|
3362
|
|
|
|
|
|
|
This method is called to make an object from the GFF "group" field. |
3363
|
|
|
|
|
|
|
By default, all Target groups are turned into Bio::DB::GFF::Homol |
3364
|
|
|
|
|
|
|
objects, and everything else becomes a Bio::DB::GFF::Featname. |
3365
|
|
|
|
|
|
|
However, adaptors are free to override this method to generate more |
3366
|
|
|
|
|
|
|
interesting objects, such as true BioPerl objects, or Acedb objects. |
3367
|
|
|
|
|
|
|
|
3368
|
|
|
|
|
|
|
Arguments are: |
3369
|
|
|
|
|
|
|
|
3370
|
|
|
|
|
|
|
$name database ID for object |
3371
|
|
|
|
|
|
|
$class class of object |
3372
|
|
|
|
|
|
|
$start for similarities, start of match inside object |
3373
|
|
|
|
|
|
|
$stop for similarities, stop of match inside object |
3374
|
|
|
|
|
|
|
|
3375
|
|
|
|
|
|
|
=cut |
3376
|
|
|
|
|
|
|
|
3377
|
|
|
|
|
|
|
# abstract call to turn a feature into an object, given its class and name |
3378
|
|
|
|
|
|
|
sub make_object { |
3379
|
595
|
|
|
595
|
1
|
443
|
my $self = shift; |
3380
|
595
|
|
|
|
|
589
|
my ($class,$name,$start,$stop) = @_; |
3381
|
595
|
100
|
66
|
|
|
1903
|
return Bio::DB::GFF::Homol->new($self,$class,$name,$start,$stop) |
3382
|
|
|
|
|
|
|
if defined $start and length $start; |
3383
|
432
|
|
|
|
|
977
|
return Bio::DB::GFF::Featname->new($class,$name); |
3384
|
|
|
|
|
|
|
} |
3385
|
|
|
|
|
|
|
|
3386
|
|
|
|
|
|
|
|
3387
|
|
|
|
|
|
|
=head2 do_attributes |
3388
|
|
|
|
|
|
|
|
3389
|
|
|
|
|
|
|
Title : do_attributes |
3390
|
|
|
|
|
|
|
Usage : $db->do_attributes($id [,$tag]); |
3391
|
|
|
|
|
|
|
Function: internal method to retrieve attributes given an id and tag |
3392
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
3393
|
|
|
|
|
|
|
Args : a feature id and a attribute tag (optional) |
3394
|
|
|
|
|
|
|
Status : protected |
3395
|
|
|
|
|
|
|
|
3396
|
|
|
|
|
|
|
This method is overridden by subclasses in order to return a list of |
3397
|
|
|
|
|
|
|
attributes. If called with a tag, returns the value of attributes of |
3398
|
|
|
|
|
|
|
that tag type. If called without a tag, returns a flattened array of |
3399
|
|
|
|
|
|
|
(tag=Evalue) pairs. A particular tag can be present multiple times. |
3400
|
|
|
|
|
|
|
|
3401
|
|
|
|
|
|
|
=cut |
3402
|
|
|
|
|
|
|
|
3403
|
|
|
|
|
|
|
sub do_attributes { |
3404
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
3405
|
0
|
|
|
|
|
0
|
my ($id,$tag) = @_; |
3406
|
0
|
|
|
|
|
0
|
return (); |
3407
|
|
|
|
|
|
|
} |
3408
|
|
|
|
|
|
|
|
3409
|
|
|
|
|
|
|
=head2 clone |
3410
|
|
|
|
|
|
|
|
3411
|
|
|
|
|
|
|
The clone() method should be used when you want to pass the |
3412
|
|
|
|
|
|
|
Bio::DB::GFF object to a child process across a fork(). The child must |
3413
|
|
|
|
|
|
|
call clone() before making any queries. |
3414
|
|
|
|
|
|
|
|
3415
|
|
|
|
|
|
|
The default behavior is to do nothing, but adaptors that use the DBI |
3416
|
|
|
|
|
|
|
interface may need to implement this in order to avoid database handle |
3417
|
|
|
|
|
|
|
errors. See the dbi adaptor for an example. |
3418
|
|
|
|
|
|
|
|
3419
|
|
|
|
|
|
|
=cut |
3420
|
|
|
|
|
|
|
|
3421
|
|
|
|
2
|
1
|
|
sub clone { } |
3422
|
|
|
|
|
|
|
|
3423
|
|
|
|
|
|
|
|
3424
|
|
|
|
|
|
|
=head1 Internal Methods |
3425
|
|
|
|
|
|
|
|
3426
|
|
|
|
|
|
|
The following methods are internal to Bio::DB::GFF and are not |
3427
|
|
|
|
|
|
|
guaranteed to remain the same. |
3428
|
|
|
|
|
|
|
|
3429
|
|
|
|
|
|
|
=head2 _features |
3430
|
|
|
|
|
|
|
|
3431
|
|
|
|
|
|
|
Title : _features |
3432
|
|
|
|
|
|
|
Usage : $db->_features($search,$options,$parent) |
3433
|
|
|
|
|
|
|
Function: internal method |
3434
|
|
|
|
|
|
|
Returns : a list of Bio::DB::GFF::Feature objects |
3435
|
|
|
|
|
|
|
Args : see below |
3436
|
|
|
|
|
|
|
Status : internal |
3437
|
|
|
|
|
|
|
|
3438
|
|
|
|
|
|
|
This is an internal method that is called by overlapping_features(), |
3439
|
|
|
|
|
|
|
contained_features() and features() to create features based on a |
3440
|
|
|
|
|
|
|
parent segment's coordinate system. It takes three arguments, a |
3441
|
|
|
|
|
|
|
search options hashref, an options hashref, and a parent segment. |
3442
|
|
|
|
|
|
|
|
3443
|
|
|
|
|
|
|
The search hashref contains the following keys: |
3444
|
|
|
|
|
|
|
|
3445
|
|
|
|
|
|
|
rangetype One of "overlaps", "contains" or "contained_in". Indicates |
3446
|
|
|
|
|
|
|
the type of range query requested. |
3447
|
|
|
|
|
|
|
refseq reference sequence ID |
3448
|
|
|
|
|
|
|
refclass reference sequence class |
3449
|
|
|
|
|
|
|
start start of range |
3450
|
|
|
|
|
|
|
stop stop of range |
3451
|
|
|
|
|
|
|
types arrayref containing list of types in "method:source" form |
3452
|
|
|
|
|
|
|
|
3453
|
|
|
|
|
|
|
The options hashref contains zero or more of the following keys: |
3454
|
|
|
|
|
|
|
|
3455
|
|
|
|
|
|
|
sparse turn on optimizations for a rare feature |
3456
|
|
|
|
|
|
|
automerge if true, invoke aggregators to merge features |
3457
|
|
|
|
|
|
|
iterator if true, return an iterator |
3458
|
|
|
|
|
|
|
|
3459
|
|
|
|
|
|
|
The $parent argument is a scalar object containing a |
3460
|
|
|
|
|
|
|
Bio::DB::GFF::RelSegment object or descendent. |
3461
|
|
|
|
|
|
|
|
3462
|
|
|
|
|
|
|
=cut |
3463
|
|
|
|
|
|
|
|
3464
|
|
|
|
|
|
|
#' |
3465
|
|
|
|
|
|
|
|
3466
|
|
|
|
|
|
|
sub _features { |
3467
|
100
|
|
|
100
|
|
115
|
my $self = shift; |
3468
|
100
|
|
|
|
|
150
|
my ($search,$options,$parent) = @_; |
3469
|
0
|
|
|
|
|
0
|
(@{$search}{qw(start stop)}) = (@{$search}{qw(stop start)}) |
|
0
|
|
|
|
|
0
|
|
3470
|
100
|
50
|
66
|
|
|
338
|
if defined($search->{start}) && $search->{start} > $search->{stop}; |
3471
|
100
|
50
|
|
|
|
171
|
$search->{refseq} = $search->{seq_id} if exists $search->{seq_id}; |
3472
|
|
|
|
|
|
|
|
3473
|
100
|
|
|
|
|
262
|
my $types = $self->parse_types($search->{types}); # parse out list of types |
3474
|
100
|
|
|
|
|
148
|
my @aggregated_types = @$types; # keep a copy |
3475
|
|
|
|
|
|
|
|
3476
|
|
|
|
|
|
|
# allow the aggregators to operate on the original |
3477
|
100
|
|
|
|
|
90
|
my @aggregators; |
3478
|
100
|
100
|
|
|
|
164
|
if ($options->{automerge}) { |
3479
|
85
|
|
|
|
|
163
|
for my $a ($self->aggregators) { |
3480
|
175
|
100
|
|
|
|
275
|
$a = $a->clone if $options->{iterator}; |
3481
|
175
|
100
|
|
|
|
397
|
unshift @aggregators,$a |
3482
|
|
|
|
|
|
|
if $a->disaggregate(\@aggregated_types,$self); |
3483
|
|
|
|
|
|
|
} |
3484
|
|
|
|
|
|
|
} |
3485
|
|
|
|
|
|
|
|
3486
|
100
|
100
|
|
|
|
192
|
if ($options->{iterator}) { |
3487
|
15
|
|
|
|
|
18
|
my @accumulated_features; |
3488
|
15
|
|
|
15
|
|
40
|
my $callback = $options->{automerge} ? sub { $self->make_aggregated_feature(\@accumulated_features,$parent,\@aggregators,@_) } |
3489
|
15
|
100
|
|
270
|
|
79
|
: sub { [$self->make_feature($parent,undef,@_)] }; |
|
270
|
|
|
|
|
437
|
|
3490
|
|
|
|
|
|
|
return $self->get_features_iterator({ %$search, |
3491
|
|
|
|
|
|
|
types => \@aggregated_types }, |
3492
|
|
|
|
|
|
|
{ %$options, |
3493
|
|
|
|
|
|
|
sort_by_group => $options->{automerge} }, |
3494
|
15
|
|
|
|
|
107
|
$callback |
3495
|
|
|
|
|
|
|
); |
3496
|
|
|
|
|
|
|
} |
3497
|
|
|
|
|
|
|
|
3498
|
85
|
|
|
|
|
91
|
my %groups; # cache the groups we create to avoid consuming too much unecessary memory |
3499
|
85
|
|
|
|
|
85
|
my $features = []; |
3500
|
|
|
|
|
|
|
|
3501
|
85
|
|
|
470
|
|
427
|
my $callback = sub { push @$features,$self->make_feature($parent,\%groups,@_) }; |
|
470
|
|
|
|
|
742
|
|
3502
|
85
|
|
|
|
|
618
|
$self->get_features({ %$search, |
3503
|
|
|
|
|
|
|
types => \@aggregated_types }, |
3504
|
|
|
|
|
|
|
$options, |
3505
|
|
|
|
|
|
|
$callback); |
3506
|
|
|
|
|
|
|
|
3507
|
85
|
100
|
|
|
|
271
|
if ($options->{automerge}) { |
3508
|
80
|
50
|
|
|
|
163
|
warn "aggregating...\n" if $self->debug; |
3509
|
80
|
|
|
|
|
128
|
foreach my $a (@aggregators) { # last aggregator gets first shot |
3510
|
95
|
50
|
|
|
|
148
|
warn "Aggregator $a:\n" if $self->debug; |
3511
|
95
|
|
|
|
|
232
|
$a->aggregate($features,$self); |
3512
|
|
|
|
|
|
|
} |
3513
|
|
|
|
|
|
|
} |
3514
|
|
|
|
|
|
|
|
3515
|
85
|
|
|
|
|
781
|
@$features; |
3516
|
|
|
|
|
|
|
} |
3517
|
|
|
|
|
|
|
|
3518
|
|
|
|
|
|
|
=head2 get_features_iterator |
3519
|
|
|
|
|
|
|
|
3520
|
|
|
|
|
|
|
Title : get_features_iterator |
3521
|
|
|
|
|
|
|
Usage : $db->get_features_iterator($search,$options,$callback) |
3522
|
|
|
|
|
|
|
Function: get an iterator on a features query |
3523
|
|
|
|
|
|
|
Returns : a Bio::SeqIO object |
3524
|
|
|
|
|
|
|
Args : as per get_features() |
3525
|
|
|
|
|
|
|
Status : Public |
3526
|
|
|
|
|
|
|
|
3527
|
|
|
|
|
|
|
This method takes the same arguments as get_features(), but returns an |
3528
|
|
|
|
|
|
|
iterator that can be used to fetch features sequentially, as per |
3529
|
|
|
|
|
|
|
Bio::SeqIO. |
3530
|
|
|
|
|
|
|
|
3531
|
|
|
|
|
|
|
Internally, this method is simply a front end to range_query(). |
3532
|
|
|
|
|
|
|
The latter method constructs and executes the query, returning a |
3533
|
|
|
|
|
|
|
statement handle. This routine passes the statement handle to the |
3534
|
|
|
|
|
|
|
constructor for the iterator, along with the callback. |
3535
|
|
|
|
|
|
|
|
3536
|
|
|
|
|
|
|
=cut |
3537
|
|
|
|
|
|
|
|
3538
|
|
|
|
|
|
|
sub get_features_iterator { |
3539
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
3540
|
0
|
|
|
|
|
0
|
my ($search,$options,$callback) = @_; |
3541
|
0
|
|
|
|
|
0
|
$self->throw('feature iteration is not implemented in this adaptor'); |
3542
|
|
|
|
|
|
|
} |
3543
|
|
|
|
|
|
|
|
3544
|
|
|
|
|
|
|
=head2 split_group |
3545
|
|
|
|
|
|
|
|
3546
|
|
|
|
|
|
|
Title : split_group |
3547
|
|
|
|
|
|
|
Usage : $db->split_group($group_field,$gff3_flag) |
3548
|
|
|
|
|
|
|
Function: parse GFF group field |
3549
|
|
|
|
|
|
|
Returns : ($gclass,$gname,$tstart,$tstop,$attributes) |
3550
|
|
|
|
|
|
|
Args : the gff group column and a flag indicating gff3 compatibility |
3551
|
|
|
|
|
|
|
Status : internal |
3552
|
|
|
|
|
|
|
|
3553
|
|
|
|
|
|
|
This is a method that is called by load_gff_line to parse out the |
3554
|
|
|
|
|
|
|
contents of one or more group fields. It returns the class of the |
3555
|
|
|
|
|
|
|
group, its name, the start and stop of the target, if any, and an |
3556
|
|
|
|
|
|
|
array reference containing any attributes that were stuck into the |
3557
|
|
|
|
|
|
|
group field, in [attribute_name,attribute_value] format. |
3558
|
|
|
|
|
|
|
|
3559
|
|
|
|
|
|
|
=cut |
3560
|
|
|
|
|
|
|
|
3561
|
|
|
|
|
|
|
sub split_group { |
3562
|
165
|
|
|
165
|
1
|
130
|
my $self = shift; |
3563
|
165
|
|
|
|
|
183
|
my ($group,$gff3) = @_; |
3564
|
165
|
100
|
|
|
|
188
|
if ($gff3) { |
3565
|
66
|
|
|
|
|
142
|
my @groups = split /[;&]/,$group; # so easy! |
3566
|
66
|
|
|
|
|
114
|
return $self->_split_gff3_group(@groups); |
3567
|
|
|
|
|
|
|
} else { |
3568
|
|
|
|
|
|
|
# handle group parsing |
3569
|
|
|
|
|
|
|
# protect embedded semicolons in the group; there must be faster/more elegant way |
3570
|
|
|
|
|
|
|
# to do this. |
3571
|
99
|
|
|
|
|
96
|
$group =~ s/\\;/$;/g; |
3572
|
99
|
|
|
|
|
189
|
while ($group =~ s/( \"[^\"]*);([^\"]*\")/$1$;$2/) { 1 } |
|
0
|
|
|
|
|
0
|
|
3573
|
99
|
|
|
|
|
201
|
my @groups = split(/\s*;\s*/,$group); |
3574
|
99
|
|
|
|
|
111
|
foreach (@groups) { s/$;/;/g } |
|
138
|
|
|
|
|
207
|
|
3575
|
99
|
|
|
|
|
147
|
return $self->_split_gff2_group(@groups); |
3576
|
|
|
|
|
|
|
} |
3577
|
|
|
|
|
|
|
} |
3578
|
|
|
|
|
|
|
|
3579
|
|
|
|
|
|
|
=head2 _split_gff2_group |
3580
|
|
|
|
|
|
|
|
3581
|
|
|
|
|
|
|
This is an internal method called by split_group(). |
3582
|
|
|
|
|
|
|
|
3583
|
|
|
|
|
|
|
=cut |
3584
|
|
|
|
|
|
|
|
3585
|
|
|
|
|
|
|
# this has gotten quite nasty due to transition from GFF2 to GFF2.5 |
3586
|
|
|
|
|
|
|
# (artemis) to GFF3. |
3587
|
|
|
|
|
|
|
|
3588
|
|
|
|
|
|
|
sub _split_gff2_group { |
3589
|
99
|
|
|
99
|
|
72
|
my $self = shift; |
3590
|
99
|
|
|
|
|
132
|
my @groups = @_; |
3591
|
99
|
|
|
|
|
54
|
my $target_found; |
3592
|
|
|
|
|
|
|
|
3593
|
99
|
|
|
|
|
69
|
my ($gclass,$gname,$tstart,$tstop,@attributes,@notes); |
3594
|
|
|
|
|
|
|
|
3595
|
99
|
|
|
|
|
99
|
for (@groups) { |
3596
|
|
|
|
|
|
|
|
3597
|
138
|
|
|
|
|
387
|
my ($tag,$value) = /^(\S+)(?:\s+(.+))?/; |
3598
|
138
|
50
|
|
|
|
186
|
$value = '' unless defined $value; |
3599
|
138
|
100
|
|
|
|
207
|
if ($value =~ /^\"(.+)\"$/) { #remove quotes |
3600
|
24
|
|
|
|
|
33
|
$value = $1; |
3601
|
|
|
|
|
|
|
} |
3602
|
138
|
|
|
|
|
111
|
$value =~ s/\\t/\t/g; |
3603
|
138
|
|
|
|
|
84
|
$value =~ s/\\r/\r/g; |
3604
|
138
|
|
|
|
|
150
|
$value =~ s/\s+$//; |
3605
|
|
|
|
|
|
|
|
3606
|
|
|
|
|
|
|
# Any additional groups become part of the attributes hash |
3607
|
|
|
|
|
|
|
# For historical reasons, the tag "Note" is treated as an |
3608
|
|
|
|
|
|
|
# attribute, even if it is the only group. |
3609
|
138
|
|
50
|
|
|
147
|
$tag ||= ''; |
3610
|
138
|
50
|
33
|
|
|
672
|
if ($tag eq 'tstart' && $target_found) { |
|
|
50
|
33
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
3611
|
0
|
|
|
|
|
0
|
$tstart = $value; |
3612
|
|
|
|
|
|
|
} |
3613
|
|
|
|
|
|
|
|
3614
|
|
|
|
|
|
|
elsif ($tag eq 'tend' && $target_found) { |
3615
|
0
|
|
|
|
|
0
|
$tstop = $value; |
3616
|
|
|
|
|
|
|
} |
3617
|
|
|
|
|
|
|
|
3618
|
|
|
|
|
|
|
elsif (ucfirst $tag eq 'Note') { |
3619
|
12
|
|
|
|
|
27
|
push @notes, [$tag => $value]; |
3620
|
|
|
|
|
|
|
} |
3621
|
|
|
|
|
|
|
|
3622
|
|
|
|
|
|
|
elsif ($tag eq 'Target' && /([^:\"\s]+):([^\"\s]+)/) { # major disagreement in implementors of GFF2 here |
3623
|
21
|
|
|
|
|
24
|
$target_found++; |
3624
|
21
|
|
|
|
|
45
|
($gclass,$gname) = ($1,$2); |
3625
|
21
|
|
|
|
|
144
|
($tstart,$tstop) = / (\d+) (\d+)/; |
3626
|
|
|
|
|
|
|
} |
3627
|
|
|
|
|
|
|
|
3628
|
|
|
|
|
|
|
elsif (!defined($value)) { |
3629
|
0
|
|
|
|
|
0
|
push @notes, [Note => $tag]; # e.g. "Confirmed_by_EST" |
3630
|
|
|
|
|
|
|
} |
3631
|
|
|
|
|
|
|
|
3632
|
|
|
|
|
|
|
else { |
3633
|
105
|
|
|
|
|
195
|
push @attributes, [$tag => $value]; |
3634
|
|
|
|
|
|
|
} |
3635
|
|
|
|
|
|
|
} |
3636
|
|
|
|
|
|
|
|
3637
|
|
|
|
|
|
|
# group assignment |
3638
|
99
|
100
|
33
|
|
|
345
|
if (@attributes && !($gclass && $gname) ) { |
|
|
|
66
|
|
|
|
|
3639
|
|
|
|
|
|
|
|
3640
|
78
|
50
|
|
|
|
153
|
my $preferred = ref($self) ? $self->_preferred_groups_hash : {}; |
3641
|
|
|
|
|
|
|
|
3642
|
78
|
|
|
|
|
81
|
for my $pair (@attributes) { |
3643
|
105
|
|
|
|
|
123
|
my ($c,$n) = @$pair; |
3644
|
|
|
|
|
|
|
($gclass,$gname) = ($c,$n) |
3645
|
|
|
|
|
|
|
if !$gclass # pick up first one |
3646
|
|
|
|
|
|
|
|| |
3647
|
105
|
100
|
100
|
|
|
333
|
($preferred->{lc $gclass}||0) < ($preferred->{lc $c}||0); # pick up higher priority one |
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
3648
|
|
|
|
|
|
|
} |
3649
|
|
|
|
|
|
|
|
3650
|
78
|
|
|
|
|
75
|
@attributes = grep {$gclass ne $_->[0]} @attributes; |
|
105
|
|
|
|
|
198
|
|
3651
|
|
|
|
|
|
|
} |
3652
|
|
|
|
|
|
|
|
3653
|
99
|
|
|
|
|
81
|
push @attributes, @notes; |
3654
|
|
|
|
|
|
|
|
3655
|
99
|
|
|
|
|
255
|
return ($gclass,$gname,$tstart,$tstop,\@attributes); |
3656
|
|
|
|
|
|
|
} |
3657
|
|
|
|
|
|
|
|
3658
|
|
|
|
|
|
|
|
3659
|
|
|
|
|
|
|
=head2 gff3_name_munging |
3660
|
|
|
|
|
|
|
|
3661
|
|
|
|
|
|
|
Title : gff3_name_munging |
3662
|
|
|
|
|
|
|
Usage : $db->gff3_name_munging($boolean) |
3663
|
|
|
|
|
|
|
Function: get/set gff3_name_munging flag |
3664
|
|
|
|
|
|
|
Returns : $current value of flag |
3665
|
|
|
|
|
|
|
Args : new value of flag (optional) |
3666
|
|
|
|
|
|
|
Status : utility |
3667
|
|
|
|
|
|
|
|
3668
|
|
|
|
|
|
|
If this is set to true (default false), then features identified in |
3669
|
|
|
|
|
|
|
gff3 files with an ID in the format foo:bar will be parsed so that |
3670
|
|
|
|
|
|
|
"foo" is the class and "bar" is the name. This is mostly for backward |
3671
|
|
|
|
|
|
|
compatibility with GFF2. |
3672
|
|
|
|
|
|
|
|
3673
|
|
|
|
|
|
|
=cut |
3674
|
|
|
|
|
|
|
|
3675
|
|
|
|
|
|
|
sub gff3_name_munging { |
3676
|
71
|
|
|
71
|
1
|
66
|
my $self = shift; |
3677
|
71
|
|
|
|
|
68
|
my $d = $self->{gff3_name_munging}; |
3678
|
71
|
100
|
|
|
|
107
|
$self->{gff3_name_munging} = shift if @_; |
3679
|
71
|
|
|
|
|
108
|
$d; |
3680
|
|
|
|
|
|
|
} |
3681
|
|
|
|
|
|
|
|
3682
|
|
|
|
|
|
|
=head2 _split_gff3_group |
3683
|
|
|
|
|
|
|
|
3684
|
|
|
|
|
|
|
This is called internally from split_group(). |
3685
|
|
|
|
|
|
|
|
3686
|
|
|
|
|
|
|
=cut |
3687
|
|
|
|
|
|
|
|
3688
|
|
|
|
|
|
|
sub _split_gff3_group { |
3689
|
66
|
|
|
66
|
|
42
|
my $self = shift; |
3690
|
66
|
|
|
|
|
72
|
my @groups = @_; |
3691
|
66
|
|
|
|
|
78
|
my $dc = $self->default_class; |
3692
|
66
|
|
|
|
|
64
|
my (%id,@attributes); |
3693
|
|
|
|
|
|
|
|
3694
|
66
|
|
|
|
|
68
|
for my $group (@groups) { |
3695
|
92
|
|
|
|
|
136
|
my ($tag,$value) = split /=/,$group; |
3696
|
92
|
|
|
|
|
126
|
$tag = unescape($tag); |
3697
|
92
|
|
|
|
|
108
|
my @values = map {unescape($_)} split /,/,$value; |
|
92
|
|
|
|
|
86
|
|
3698
|
|
|
|
|
|
|
|
3699
|
|
|
|
|
|
|
# GFF2 traditionally did not distinguish between a feature's name |
3700
|
|
|
|
|
|
|
# and the group it belonged to. This code is a transition between |
3701
|
|
|
|
|
|
|
# gff2 and the new parent/ID dichotomy in gff3. |
3702
|
92
|
50
|
66
|
|
|
348
|
if ($tag eq 'Parent') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
3703
|
0
|
|
|
|
|
0
|
my (@names,@classes); |
3704
|
0
|
|
|
|
|
0
|
for (@values) { |
3705
|
0
|
|
|
|
|
0
|
my ($name,$class) = $self->_gff3_name_munging($_,$dc); |
3706
|
0
|
|
|
|
|
0
|
push @names,$name; |
3707
|
0
|
|
|
|
|
0
|
push @classes,$class; |
3708
|
|
|
|
|
|
|
} |
3709
|
0
|
0
|
|
|
|
0
|
$id{$tag} = @names > 1 ? [\@names,\@classes] : [$names[0],$classes[0]]; |
3710
|
|
|
|
|
|
|
} |
3711
|
|
|
|
|
|
|
elsif ($tag eq 'ID' || $tag eq 'Name') { |
3712
|
52
|
|
|
|
|
74
|
$id{$tag} = [$self->_gff3_name_munging(shift(@values),$dc)]; |
3713
|
|
|
|
|
|
|
} |
3714
|
|
|
|
|
|
|
elsif ($tag eq 'Target') { |
3715
|
14
|
|
|
|
|
48
|
my ($gname,$tstart,$tstop) = split /\s+/,shift @values; |
3716
|
14
|
|
|
|
|
26
|
$id{$tag} = [$self->_gff3_name_munging($gname,$dc),$tstart,$tstop]; |
3717
|
|
|
|
|
|
|
} |
3718
|
|
|
|
|
|
|
elsif ($tag =~ /synonym/i) { |
3719
|
0
|
|
|
|
|
0
|
$tag = 'Alias'; |
3720
|
|
|
|
|
|
|
} |
3721
|
92
|
|
|
|
|
186
|
push @attributes,[$tag=>$_] foreach @values; |
3722
|
|
|
|
|
|
|
} |
3723
|
|
|
|
|
|
|
|
3724
|
66
|
|
|
|
|
94
|
my $priorities = $self->_preferred_groups_hash(1); |
3725
|
66
|
|
|
|
|
52
|
my ($gclass,$gname,$tstart,$tstop); |
3726
|
66
|
|
|
|
|
110
|
for my $preferred (sort {$priorities->{lc $b}<=>$priorities->{lc $a}} |
|
0
|
|
|
|
|
0
|
|
3727
|
|
|
|
|
|
|
keys %id) { |
3728
|
66
|
50
|
|
|
|
98
|
unless (defined $gname) { |
3729
|
66
|
|
|
|
|
42
|
($gname,$gclass,$tstart,$tstop) = @{$id{$preferred}}; |
|
66
|
|
|
|
|
136
|
|
3730
|
|
|
|
|
|
|
} |
3731
|
|
|
|
|
|
|
} |
3732
|
|
|
|
|
|
|
|
3733
|
|
|
|
|
|
|
# set null gclass to empty string to preserve compatibility with |
3734
|
|
|
|
|
|
|
# programs that expect a defined gclass if no gname |
3735
|
66
|
50
|
50
|
|
|
136
|
$gclass ||= '' if defined $gname; |
3736
|
|
|
|
|
|
|
|
3737
|
66
|
|
|
|
|
216
|
return ($gclass,$gname,$tstart,$tstop,\@attributes); |
3738
|
|
|
|
|
|
|
} |
3739
|
|
|
|
|
|
|
|
3740
|
|
|
|
|
|
|
# accomodation for wormbase style of class:name naming |
3741
|
|
|
|
|
|
|
sub _gff3_name_munging { |
3742
|
66
|
|
|
66
|
|
48
|
my $self = shift; |
3743
|
66
|
|
|
|
|
56
|
my ($name,$default_class) = @_; |
3744
|
66
|
50
|
|
|
|
136
|
return ($name,$default_class) unless $self->gff3_name_munging; |
3745
|
|
|
|
|
|
|
|
3746
|
66
|
50
|
|
|
|
182
|
if ($name =~ /^(\w+):(.+)/) { |
3747
|
66
|
|
|
|
|
248
|
return ($2,$1); |
3748
|
|
|
|
|
|
|
} else { |
3749
|
0
|
|
|
|
|
0
|
return ($name,$default_class); |
3750
|
|
|
|
|
|
|
} |
3751
|
|
|
|
|
|
|
} |
3752
|
|
|
|
|
|
|
|
3753
|
|
|
|
|
|
|
=head2 _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features() |
3754
|
|
|
|
|
|
|
|
3755
|
|
|
|
|
|
|
Title : _delete_features(), _delete_groups(),_delete(),_delete_fattribute_to_features() |
3756
|
|
|
|
|
|
|
Usage : $count = $db->_delete_features(@feature_ids) |
3757
|
|
|
|
|
|
|
$count = $db->_delete_groups(@group_ids) |
3758
|
|
|
|
|
|
|
$count = $db->_delete(\%delete_spec) |
3759
|
|
|
|
|
|
|
$count = $db->_delete_fattribute_to_features(@feature_ids) |
3760
|
|
|
|
|
|
|
Function: low-level feature/group deleter |
3761
|
|
|
|
|
|
|
Returns : count of groups removed |
3762
|
|
|
|
|
|
|
Args : list of feature or group ids removed |
3763
|
|
|
|
|
|
|
Status : for implementation by subclasses |
3764
|
|
|
|
|
|
|
|
3765
|
|
|
|
|
|
|
These methods need to be implemented in adaptors. For _delete_features, |
3766
|
|
|
|
|
|
|
_delete_groups and _delete_fattribute_to_features, the arguments are a list of |
3767
|
|
|
|
|
|
|
feature or group IDs to remove. For _delete(), the argument is a hashref with |
3768
|
|
|
|
|
|
|
the three keys 'segments', 'types' and 'force'. The first contains an arrayref |
3769
|
|
|
|
|
|
|
of Bio::DB::GFF::RelSegment objects to delete (all FEATURES within the segment |
3770
|
|
|
|
|
|
|
are deleted). The second contains an arrayref of [method,source] feature types |
3771
|
|
|
|
|
|
|
to delete. The two are ANDed together. If 'force' has a true value, this |
3772
|
|
|
|
|
|
|
forces the operation to continue even if it would delete all features. |
3773
|
|
|
|
|
|
|
|
3774
|
|
|
|
|
|
|
=cut |
3775
|
|
|
|
|
|
|
|
3776
|
|
|
|
|
|
|
sub _delete_features { |
3777
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
3778
|
0
|
|
|
|
|
0
|
my @feature_ids = @_; |
3779
|
0
|
|
|
|
|
0
|
$self->throw('_delete_features is not implemented in this adaptor'); |
3780
|
|
|
|
|
|
|
} |
3781
|
|
|
|
|
|
|
|
3782
|
|
|
|
|
|
|
sub _delete_groups { |
3783
|
5
|
|
|
5
|
|
8
|
my $self = shift; |
3784
|
5
|
|
|
|
|
10
|
my @group_ids = @_; |
3785
|
5
|
|
|
|
|
19
|
$self->throw('_delete_groups is not implemented in this adaptor'); |
3786
|
|
|
|
|
|
|
} |
3787
|
|
|
|
|
|
|
|
3788
|
|
|
|
|
|
|
sub _delete { |
3789
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
3790
|
0
|
|
|
|
|
0
|
my $delete_options = shift; |
3791
|
0
|
|
|
|
|
0
|
$self->throw('_delete is not implemented in this adaptor'); |
3792
|
|
|
|
|
|
|
} |
3793
|
|
|
|
|
|
|
|
3794
|
|
|
|
|
|
|
sub _delete_fattribute_to_features { |
3795
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
3796
|
0
|
|
|
|
|
0
|
my @feature_ids = @_; |
3797
|
0
|
|
|
|
|
0
|
$self->throw('_delete_fattribute_to_features is not implemented in this adaptor'); |
3798
|
|
|
|
|
|
|
} |
3799
|
|
|
|
|
|
|
|
3800
|
|
|
|
|
|
|
|
3801
|
|
|
|
|
|
|
sub unescape { |
3802
|
184
|
|
|
184
|
0
|
122
|
my $v = shift; |
3803
|
184
|
|
|
|
|
144
|
$v =~ tr/+/ /; |
3804
|
184
|
|
|
|
|
142
|
$v =~ s/%([0-9a-fA-F]{2})/chr hex($1)/ge; |
|
0
|
|
|
|
|
0
|
|
3805
|
184
|
|
|
|
|
224
|
return $v; |
3806
|
|
|
|
|
|
|
} |
3807
|
|
|
|
|
|
|
|
3808
|
|
|
|
|
|
|
sub print_gff3_warning { |
3809
|
2
|
|
|
2
|
0
|
6
|
my $self = shift; |
3810
|
2
|
|
|
|
|
146
|
print STDERR <
|
3811
|
|
|
|
|
|
|
|
3812
|
|
|
|
|
|
|
You are loading a Bio::DB::GFF database with GFF3 formatted data. |
3813
|
|
|
|
|
|
|
While this will likely work fine, the Bio::DB::GFF schema does not |
3814
|
|
|
|
|
|
|
always faithfully capture the complexity represented in GFF3 files. |
3815
|
|
|
|
|
|
|
Unless you have a specific reason for using Bio::DB::GFF, we suggest |
3816
|
|
|
|
|
|
|
that you use a Bio::DB::SeqFeature::Store database and its corresponding |
3817
|
|
|
|
|
|
|
loader, bp_seqfeature_load.pl. |
3818
|
|
|
|
|
|
|
|
3819
|
|
|
|
|
|
|
END |
3820
|
|
|
|
|
|
|
; |
3821
|
|
|
|
|
|
|
|
3822
|
2
|
|
|
|
|
10
|
return; |
3823
|
|
|
|
|
|
|
} |
3824
|
|
|
|
|
|
|
|
3825
|
|
|
|
|
|
|
|
3826
|
|
|
|
|
|
|
package Bio::DB::GFF::ID_Iterator; |
3827
|
3
|
|
|
3
|
|
12
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
51
|
|
3828
|
|
|
|
|
|
|
|
3829
|
3
|
|
|
3
|
|
9
|
use base qw(Bio::Root::Root); |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
774
|
|
3830
|
|
|
|
|
|
|
|
3831
|
|
|
|
|
|
|
sub new { |
3832
|
0
|
|
|
0
|
|
|
my $class = shift; |
3833
|
0
|
|
|
|
|
|
my ($db,$ids,$type) = @_; |
3834
|
0
|
|
|
|
|
|
return bless {ids=>$ids,db=>$db,type=>$type},$class; |
3835
|
|
|
|
|
|
|
} |
3836
|
|
|
|
|
|
|
|
3837
|
|
|
|
|
|
|
sub next_seq { |
3838
|
0
|
|
|
0
|
|
|
my $self = shift; |
3839
|
0
|
|
|
|
|
|
my $next = shift @{$self->{ids}}; |
|
0
|
|
|
|
|
|
|
3840
|
0
|
0
|
|
|
|
|
return unless $next; |
3841
|
0
|
0
|
|
|
|
|
my $name = ref($next) eq 'ARRAY' ? Bio::DB::GFF::Featname->new(@$next) : $next; |
3842
|
|
|
|
|
|
|
my $segment = $self->{type} eq 'name' ? $self->{db}->segment($name) |
3843
|
|
|
|
|
|
|
: $self->{type} eq 'feature' ? $self->{db}->fetch_feature_by_id($name) |
3844
|
0
|
0
|
|
|
|
|
: $self->{type} eq 'group' ? $self->{db}->fetch_feature_by_gid($name) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3845
|
|
|
|
|
|
|
: $self->throw("Bio::DB::GFF::ID_Iterator called to fetch an unknown type of identifier"); |
3846
|
0
|
0
|
|
|
|
|
$self->throw("id does not exist") unless $segment; |
3847
|
0
|
|
|
|
|
|
return $segment; |
3848
|
|
|
|
|
|
|
} |
3849
|
|
|
|
|
|
|
|
3850
|
|
|
|
|
|
|
package Bio::DB::GFF::FeatureIterator; |
3851
|
|
|
|
|
|
|
|
3852
|
|
|
|
|
|
|
sub new { |
3853
|
0
|
|
|
0
|
|
|
my $self = shift; |
3854
|
0
|
|
|
|
|
|
my @features = @_; |
3855
|
0
|
|
0
|
|
|
|
return bless \@features,ref $self || $self; |
3856
|
|
|
|
|
|
|
} |
3857
|
|
|
|
|
|
|
sub next_seq { |
3858
|
0
|
|
|
0
|
|
|
my $self = shift; |
3859
|
0
|
0
|
|
|
|
|
return unless @$self; |
3860
|
0
|
|
|
|
|
|
return shift @$self; |
3861
|
|
|
|
|
|
|
} |
3862
|
|
|
|
|
|
|
|
3863
|
|
|
|
|
|
|
|
3864
|
|
|
|
|
|
|
1; |
3865
|
|
|
|
|
|
|
|
3866
|
|
|
|
|
|
|
__END__ |