line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# BioPerl module for Bio::DB::Fasta |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Bio::DB::Fasta - Fast indexed access to fasta files |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 SYNOPSIS |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
use Bio::DB::Fasta; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
# Create database from a directory of Fasta files |
17
|
|
|
|
|
|
|
my $db = Bio::DB::Fasta->new('/path/to/fasta/files/'); |
18
|
|
|
|
|
|
|
my @ids = $db->get_all_primary_ids; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# Simple access |
21
|
|
|
|
|
|
|
my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000); |
22
|
|
|
|
|
|
|
my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000); |
23
|
|
|
|
|
|
|
my $length = $db->length('CHROMOSOME_I'); |
24
|
|
|
|
|
|
|
my $header = $db->header('CHROMOSOME_I'); |
25
|
|
|
|
|
|
|
my $alphabet = $db->alphabet('CHROMOSOME_I'); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Access to sequence objects. See Bio::PrimarySeqI. |
28
|
|
|
|
|
|
|
my $seq = $db->get_Seq_by_id('CHROMOSOME_I'); |
29
|
|
|
|
|
|
|
my $seqstr = $seq->seq; |
30
|
|
|
|
|
|
|
my $subseq = $seq->subseq(4_000_000 => 4_100_000); |
31
|
|
|
|
|
|
|
my $trunc = $seq->trunc(4_000_000 => 4_100_000); |
32
|
|
|
|
|
|
|
my $length = $seq->length; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Loop through sequence objects |
35
|
|
|
|
|
|
|
my $stream = $db->get_PrimarySeq_stream; |
36
|
|
|
|
|
|
|
while (my $seq = $stream->next_seq) { |
37
|
|
|
|
|
|
|
# Bio::PrimarySeqI stuff |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Filehandle access |
41
|
|
|
|
|
|
|
my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/'); |
42
|
|
|
|
|
|
|
while (my $seq = <$fh>) { |
43
|
|
|
|
|
|
|
# Bio::PrimarySeqI stuff |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Tied hash access |
47
|
|
|
|
|
|
|
tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/'; |
48
|
|
|
|
|
|
|
print $sequences{'CHROMOSOME_I:1,20000'}; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head1 DESCRIPTION |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Bio::DB::Fasta provides indexed access to a single Fasta file, several files, |
53
|
|
|
|
|
|
|
or a directory of files. It provides persistent random access to each sequence |
54
|
|
|
|
|
|
|
entry (either as a Bio::PrimarySeqI-compliant object or a string), and to |
55
|
|
|
|
|
|
|
subsequences within each entry, allowing you to retrieve portions of very large |
56
|
|
|
|
|
|
|
sequences without bringing the entire sequence into memory. Bio::DB::Fasta is |
57
|
|
|
|
|
|
|
based on Bio::DB::IndexedBase. See this module's documentation for details. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
The Fasta files may contain any combination of nucleotide and protein sequences; |
60
|
|
|
|
|
|
|
during indexing the module guesses the molecular type. Entries may have any line |
61
|
|
|
|
|
|
|
length up to 65,536 characters, and different line lengths are allowed in the |
62
|
|
|
|
|
|
|
same file. However, within a sequence entry, all lines must be the same length |
63
|
|
|
|
|
|
|
except for the last. An error will be thrown if this is not the case. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence |
66
|
|
|
|
|
|
|
from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback |
67
|
|
|
|
|
|
|
routine to reversibly modify this primary ID, e.g. if you wish to extract a |
68
|
|
|
|
|
|
|
specific portion of the gi|gb|abc|xyz GenBank IDs. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 DATABASE CREATION AND INDEXING |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
The object-oriented constructor is new(), the filehandle constructor is newFh() |
73
|
|
|
|
|
|
|
and the tied hash constructor is tie(). They all allow indexing a single Fasta |
74
|
|
|
|
|
|
|
file, several files, or a directory of files. See Bio::DB::IndexedBase. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=head1 SEE ALSO |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
L<Bio::DB::IndexedBase> |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
L<Bio::DB::Qual> |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
L<Bio::PrimarySeqI> |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head1 AUTHOR |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Lincoln Stein E<lt>lstein@cshl.orgE<gt>. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Copyright (c) 2001 Cold Spring Harbor Laboratory. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
91
|
|
|
|
|
|
|
it under the same terms as Perl itself. See DISCLAIMER.txt for |
92
|
|
|
|
|
|
|
disclaimers of warranty. |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head1 APPENDIX |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
97
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
For BioPerl-style access, the following methods are provided: |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
=head2 get_Seq_by_id |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id |
104
|
|
|
|
|
|
|
Usage : my $seq = $db->get_Seq_by_id($id); |
105
|
|
|
|
|
|
|
Function: Given an ID, fetch the corresponding sequence from the database. |
106
|
|
|
|
|
|
|
Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant) |
107
|
|
|
|
|
|
|
Note that to save resources, Bio::PrimarySeq::Fasta sequence objects |
108
|
|
|
|
|
|
|
only load the sequence string into memory when requested using seq(). |
109
|
|
|
|
|
|
|
See L<Bio::PrimarySeqI> for methods provided by the sequence objects |
110
|
|
|
|
|
|
|
returned from get_Seq_by_id() and get_PrimarySeq_stream(). |
111
|
|
|
|
|
|
|
Args : ID |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
=head2 get_PrimarySeq_stream |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
Title : get_PrimarySeq_stream |
116
|
|
|
|
|
|
|
Usage : my $stream = $db->get_PrimarySeq_stream(); |
117
|
|
|
|
|
|
|
Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a |
118
|
|
|
|
|
|
|
single method, next_seq(). Each call to next_seq() returns a new |
119
|
|
|
|
|
|
|
Bio::PrimarySeq::Fasta sequence object, until no more sequences remain. |
120
|
|
|
|
|
|
|
Returns : A Bio::DB::Indexed::Stream object |
121
|
|
|
|
|
|
|
Args : None |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
=head1 |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
For simple access, the following methods are provided: |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=cut |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
package Bio::DB::Fasta; |
131
|
|
|
|
|
|
|
|
132
|
1
|
|
|
1
|
|
3
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
24
|
|
133
|
1
|
|
|
1
|
|
384
|
use IO::File; |
|
1
|
|
|
|
|
675
|
|
|
1
|
|
|
|
|
85
|
|
134
|
1
|
|
|
1
|
|
5
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
13
|
|
135
|
1
|
|
|
1
|
|
318
|
use Bio::PrimarySeqI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
25
|
|
136
|
|
|
|
|
|
|
|
137
|
1
|
|
|
1
|
|
5
|
use base qw(Bio::DB::IndexedBase); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
417
|
|
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
our $obj_class = 'Bio::PrimarySeq::Fasta'; |
140
|
|
|
|
|
|
|
our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}'; |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head2 new |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
Title : new |
146
|
|
|
|
|
|
|
Usage : my $db = Bio::DB::Fasta->new( $path, %options); |
147
|
|
|
|
|
|
|
Function: Initialize a new database object. When indexing a directory, files |
148
|
|
|
|
|
|
|
ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default. |
149
|
|
|
|
|
|
|
Returns : A new Bio::DB::Fasta object. |
150
|
|
|
|
|
|
|
Args : A single file, or path to dir, or arrayref of files |
151
|
|
|
|
|
|
|
Optional arguments: see Bio::DB::IndexedBase |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub _calculate_offsets {
    # Bio::DB::IndexedBase calls this to calculate offsets.
    #
    # Scans one Fasta file line by line and, for every entry, stores a packed
    # index record (built by $self->{packmeth}) into the caller's hash under
    # each ID that $self->_makeid() derives from the header line.
    #
    # Args   : $fileno  - file number stored in every packed record
    #          $file    - path of the Fasta file to scan
    #          $offsets - hash reference, filled in place (ID => packed record)
    # Returns: the $offsets hash reference that was filled in
    # Throws : if the file cannot be opened, if a line starting with '>' does
    #          not match '>(\S+)', and - when
    #          Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES is true - if the
    #          sequence lines of an entry differ in length (except the last)
    #          or a blank line occurs inside an entry.
    my ($self, $fileno, $file, $offsets) = @_;

    my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
    binmode $fh;
    warn "Indexing $file\n" if $self->{debug};

    # State of the entry currently being scanned
    my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
        $last_line);
    # Lengths of the three most recent sequence lines (oldest first) and the
    # number of blank lines seen since the last header
    my ($older_len, $prev_len, $cur_len, $blank_lines) = (0, 0, 0, 0);

    # Bytes of line terminator subtracted once per sequence line
    # (presumably 1 for LF and 2 for CRLF files - confirm in the caller)
    my $termination_length = $self->{termination_length};

    # Pack the record of the current entry and file it under all of its IDs
    my $store_entry = sub {
        my ($strlen) = @_;
        my $ppos = $self->{packmeth}->($offset, $strlen, $strlen, $linelen,
                                       $headerlen, $alphabet, $fileno);
        for my $id (@ids) {
            $offsets->{$id} = $ppos;
        }
        return;
    };

    while (my $line = <$fh>) {
        if (index($line, '>') == 0) {
            if ($line =~ /^>(\S+)/) {
                print STDERR "Indexed $count sequences...\n"
                    if $self->{debug} && (++$count%1000) == 0;

                # please, do not enforce arbitrary line length requirements.
                # It's good practice but not enforced.
                #$self->_check_linelength($linelen);

                # File position just after this header line, i.e. where the
                # sequence data of the new entry begins
                my $pos = tell($fh);
                if (@ids) {
                    # Close the previous entry: bytes between its data start
                    # and this header, minus one terminator per sequence line.
                    # NOTE(review): bytes of blank lines that precede this
                    # header are part of $pos - $offset and are not subtracted
                    # here - confirm whether that is intended.
                    my $strlen = $pos - $offset - length($line);
                    $strlen -= $termination_length * $seq_lines;
                    $store_entry->($strlen);
                    $alphabet = Bio::DB::IndexedBase::NA;
                }
                @ids = $self->_makeid($line);
                ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
                ($older_len, $prev_len, $cur_len, $blank_lines) = (0, 0, 0, 0);
            } else {
                # Catch bad header lines, bug 3172
                $self->throw("FASTA header doesn't match '>(\\S+)': $line");
            }
        } elsif ($line !~ /\S/) {
            # Skip blank line (it is not remembered as the last line read)
            $blank_lines++;
            next;
        } else {
            # Need to check every line :(
            $older_len = $prev_len;
            $prev_len  = $cur_len;
            $cur_len   = length $line;
            if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
                # Only the two lines before the current one are compared, so
                # the final line of an entry is free to be shorter
                if ( ($older_len > 0) && ($prev_len > 0) && ($older_len != $prev_len) ) {
                    my $fap = substr($line, 0, 20)."..";
                    $self->throw("Each line of the fasta entry must be the same ".
                                 "length except the last. Line above #$. '$fap' is $prev_len".
                                 " != $older_len chars.");
                }
                if ($blank_lines) {
                    # Blank lines not allowed in entry
                    $self->throw("Blank lines can only precede header lines, ".
                                 "found preceding line #$.");
                }
            }
            # The first sequence line of an entry fixes its line length and
            # alphabet guess
            $linelen  ||= length $line;
            $alphabet ||= $self->_guess_alphabet($line);
            $seq_lines++;
        }
        $last_line = $line;
    }

    # Process last entry
    $self->_check_linelength($linelen);
    my $pos = tell $fh;
    if (@ids) {
        my $strlen = $pos - $offset;
        if ($linelen == 0) { # yet another pesky empty chr_random.fa file
            $strlen = 0;
        } else {
            if ($last_line !~ /\s$/) {
                # The final line does not end in whitespace, so it carries no
                # terminator to subtract
                $seq_lines--;
            }
            $strlen -= $termination_length * $seq_lines;
        }
        $store_entry->($strlen);
    }

    # Hand back the hash that was actually filled (the original returned a
    # reference to a local hash that was never written to)
    return $offsets;
}
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head2 seq |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
Title : seq, sequence, subseq |
252
|
|
|
|
|
|
|
Usage : # Entire sequence string |
253
|
|
|
|
|
|
|
my $seqstr = $db->seq($id); |
254
|
|
|
|
|
|
|
# Subsequence |
255
|
|
|
|
|
|
|
my $subseqstr = $db->seq($id, $start, $stop, $strand); |
256
|
|
|
|
|
|
|
# or... |
257
|
|
|
|
|
|
|
my $subseqstr = $db->seq($compound_id); |
258
|
|
|
|
|
|
|
Function: Get a subseq of a sequence from the database. For your convenience, |
259
|
|
|
|
|
|
|
the sequence to extract can be specified with any of the following |
260
|
|
|
|
|
|
|
compound IDs: |
261
|
|
|
|
|
|
|
$db->seq("$id:$start,$stop") |
262
|
|
|
|
|
|
|
$db->seq("$id:$start..$stop") |
263
|
|
|
|
|
|
|
$db->seq("$id:$start-$stop") |
264
|
|
|
|
|
|
|
$db->seq("$id:$start,$stop/$strand") |
265
|
|
|
|
|
|
|
$db->seq("$id:$start..$stop/$strand") |
266
|
|
|
|
|
|
|
$db->seq("$id:$start-$stop/$strand") |
267
|
|
|
|
|
|
|
$db->seq("$id/$strand") |
268
|
|
|
|
|
|
|
In the case of DNA or RNA sequence, if $stop is less than $start, |
269
|
|
|
|
|
|
|
then the reverse complement of the sequence is returned. Avoid using |
270
|
|
|
|
|
|
|
it if possible since this goes against Bio::Seq conventions. |
271
|
|
|
|
|
|
|
Returns : A string |
272
|
|
|
|
|
|
|
Args : ID of sequence to retrieve |
273
|
|
|
|
|
|
|
or |
274
|
|
|
|
|
|
|
Compound ID of subsequence to fetch |
275
|
|
|
|
|
|
|
or |
276
|
|
|
|
|
|
|
ID, optional start (defaults to 1), optional end (defaults to length |
277
|
|
|
|
|
|
|
of sequence) and optional strand (defaults to 1). |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
=cut |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
sub subseq {
    # Fetch a (sub)sequence string for $id straight from the indexed file.
    # $id may be a compound ID; it is resolved by _parse_compound_id() together
    # with the optional $start, $stop and $strand.
    # Returns the sequence string, reverse-complemented when the resolved
    # strand is -1, or nothing when no filehandle is available for $id.
    my ($self, $id, $start, $stop, $strand) = @_;
    if (not defined $id) {
        $self->throw('Need to provide a sequence ID');
    }
    ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);

    my $handle = $self->_fh($id);
    return unless $handle;

    # File positions of the first and last requested residues
    my $first_byte = $self->_calc_offset($id, $start);
    my $last_byte  = $self->_calc_offset($id, $stop );

    my $raw;
    seek($handle, $first_byte, 0);
    read($handle, $raw, $last_byte - $first_byte + 1);

    # The raw slice spans line breaks; remove them
    my $residues = Bio::DB::IndexedBase::_strip_crnl($raw);

    return $residues unless $strand == -1;

    # Reverse-complement the sequence
    return scalar Bio::PrimarySeqI::_revcom_from_string($self, $residues, $self->alphabet($id));
}
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
*seq = *sequence = \&subseq; |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
=head2 length |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
Title : length |
310
|
|
|
|
|
|
|
Usage : my $length = $db->length($id); |
311
|
|
|
|
|
|
|
Function: Get the number of residues in the indicated sequence. |
312
|
|
|
|
|
|
|
Returns : Number |
313
|
|
|
|
|
|
|
Args : ID of entry |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=head2 header |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
Title : header |
318
|
|
|
|
|
|
|
Usage : my $header = $db->header($id); |
319
|
|
|
|
|
|
|
Function: Get the header line (ID and description fields) of the specified |
320
|
|
|
|
|
|
|
sequence. |
321
|
|
|
|
|
|
|
Returns : String |
322
|
|
|
|
|
|
|
Args : ID of sequence |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=cut |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
sub header {
    # Return the header line of entry $id without its first character and
    # without line terminators. Returns nothing when no filehandle is
    # available for $id.
    my ($self, $id) = @_;
    if (not defined $id) {
        $self->throw('Need to provide a sequence ID');
    }

    # Fields 0 and 4 of the unpacked index record: the file offset stored for
    # the entry and the length of its header line
    my @record = $self->{unpackmeth}->($self->{offsets}{$id});
    my ($data_start, $header_bytes) = @record[0, 4];

    # The header line sits immediately before the stored offset
    my $header_start = $data_start - $header_bytes;

    my $handle = $self->_fh($id);
    return unless $handle;

    my $text;
    seek($handle, $header_start, 0);
    read($handle, $text, $header_bytes);

    # On Windows chomp remove '\n' but leaves '\r'
    # when reading '\r\n' in binary mode
    $text = Bio::DB::IndexedBase::_strip_crnl($text);

    # Drop the first character of the header line
    substr($text, 0, 1, '');
    return $text;
}
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 alphabet |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Title : alphabet |
346
|
|
|
|
|
|
|
Usage : my $alphabet = $db->alphabet($id); |
347
|
|
|
|
|
|
|
Function: Get the molecular type of the indicated sequence: dna, rna or protein |
348
|
|
|
|
|
|
|
Returns : String |
349
|
|
|
|
|
|
|
Args : ID of sequence |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=cut |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
#------------------------------------------------------------- |
355
|
|
|
|
|
|
|
# Bio::PrimarySeqI compatibility |
356
|
|
|
|
|
|
|
# |
357
|
|
|
|
|
|
|
package Bio::PrimarySeq::Fasta; |
358
|
1
|
|
|
1
|
|
9
|
use overload '""' => 'display_id'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
11
|
|
359
|
|
|
|
|
|
|
|
360
|
1
|
|
|
1
|
|
61
|
use base qw(Bio::Root::Root Bio::PrimarySeqI); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
477
|
|
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
sub new {
    # Build a sequence object that remembers its database, ID and coordinates
    # but not the sequence string itself.
    # Args: DATABASE, ID, START, STOP (resolved through _rearrange()).
    my ($class, @args) = @_;
    my $self = $class->SUPER::new(@args);

    my @names = qw(DATABASE ID START STOP);
    my ($db, $id, $start, $stop) = $self->_rearrange(\@names, @args);

    @{$self}{qw(db id)} = ($db, $id);

    # A false STOP means "up to the end of the sequence"
    $self->{stop} = $stop ? $stop : $db->length($id);

    # A false START means position 1, or 0 for 0-length seqs
    if ($start) {
        $self->{start} = $start;
    }
    else {
        $self->{start} = $self->{stop} > 0 ? 1 : 0;
    }

    return $self;
}
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
sub fetch_sequence {
    # Forward to seq(), passing any extra arguments along.
    my ($self, @args) = @_;
    return $self->seq(@args);
}
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
sub seq {
    # Ask the database for the sequence string of this object's ID between
    # its stored start and stop.
    my ($self) = @_;
    my ($db, $id, $start, $stop) = @{$self}{qw(db id start stop)};
    return $db->seq($id, $start, $stop);
}
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
sub subseq {
    # Return the sequence string of the region that trunc() selects for the
    # given arguments.
    my ($self, @range) = @_;
    my $piece = $self->trunc(@range);
    return $piece->seq();
}
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
sub trunc {
    # Override Bio::PrimarySeqI trunc() method. This way, we create an object
    # that does not store the sequence in memory.
    #
    # $start and $stop are relative to this object; they are translated into
    # coordinates relative to the stored start, counting upwards when the
    # object's start <= stop and downwards otherwise.
    my ($self, $start, $stop) = @_;
    if ($stop < $start) {
        $self->throw("Stop cannot be smaller than start");
    }

    my $origin  = $self->{start};
    my $forward = $origin <= $self->{stop};

    my ($from, $to) = $forward
        ? ( $origin + $start - 1,   $origin + $stop - 1   )
        : ( $origin - ($start - 1), $origin - ($stop - 1) );

    return $self->new( $self->{db}, $self->{id}, $from, $to );
}
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
sub is_circular {
    # Return whatever is stored under the object's 'is_circular' key.
    my ($self) = @_;
    my $flag = $self->{is_circular};
    return $flag;
}
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
sub display_id {
    # Return the ID this object was created with.
    my ($self) = @_;
    my $id = $self->{id};
    return $id;
}
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
sub accession_number {
    # Always returns the placeholder string 'unknown'.
    my ($self) = @_;
    return 'unknown';
}
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub primary_id {
    # Following Bio::PrimarySeqI, since this sequence has no accession number,
    # its primary_id should be a stringified memory location.
    # overload::StrVal() gives the plain reference string, bypassing any
    # overloaded stringification.
    my ($self) = @_;
    my $address = overload::StrVal($self);
    return $address;
}
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub can_call_new {
    # Always returns 0.
    my $answer = 0;
    return $answer;
}
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
sub alphabet {
    # Ask the database for the alphabet of this object's ID.
    my ($self) = @_;
    my ($db, $id) = @{$self}{qw(db id)};
    return $db->alphabet($id);
}
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
sub revcom {
    # Override Bio::PrimarySeqI revcom() with optimized method.
    # A new object is made for the same database and ID with start and stop
    # swapped; no sequence string is read here.
    my ($self) = @_;
    my ($db, $id)       = ($self->{db},   $self->{id});
    my ($start, $stop)  = ($self->{stop}, $self->{start});
    return $self->new($db, $id, $start, $stop);
}
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
sub length {
    # Get length from sequence location, not the sequence string (too expensive)
    #
    # Returns the number of residues spanned by the stored start and stop,
    # whichever of the two is larger.
    my $self = shift;
    my ($start, $stop) = @{$self}{qw(start stop)};

    # new() stores start 0 / stop 0 for a 0-length sequence; the inclusive
    # span formula below would wrongly report 1 for it
    return 0 if !$start && !$stop;

    return abs($stop - $start) + 1;
}
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
sub description {
    # Return the part of the header line that follows the ID.
    my ($self) = @_;
    my $db     = $self->{'db'};
    my $header = $db->header($self->{id});

    # Remove the ID from the header: split once on whitespace and keep the
    # second piece
    return (split /\s+/, $header, 2)[1];
}
455
|
|
|
|
|
|
|
*desc = \&description; |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
1; |