line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Please direct questions and support issues to |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Cared for by Ewan Birney |
6
|
|
|
|
|
|
|
# and Lincoln Stein |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Ewan Birney |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# _history |
13
|
|
|
|
|
|
|
# October 18, 1999 Largely rewritten by Lincoln Stein |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Bio::SeqIO - Handler for SeqIO Formats |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Bio::SeqIO; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$in = Bio::SeqIO->new(-file => "inputfilename" , |
26
|
|
|
|
|
|
|
-format => 'Fasta'); |
27
|
|
|
|
|
|
|
$out = Bio::SeqIO->new(-file => ">outputfilename" , |
28
|
|
|
|
|
|
|
-format => 'EMBL'); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
while ( my $seq = $in->next_seq() ) { |
31
|
|
|
|
|
|
|
$out->write_seq($seq); |
32
|
|
|
|
|
|
|
} |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# Now, to actually get at the sequence object, use the standard Bio::Seq |
35
|
|
|
|
|
|
|
# methods (look at Bio::Seq if you don't know what they are) |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
use Bio::SeqIO; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$in = Bio::SeqIO->new(-file => "inputfilename" , |
40
|
|
|
|
|
|
|
-format => 'genbank'); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
while ( my $seq = $in->next_seq() ) { |
43
|
|
|
|
|
|
|
print "Sequence ",$seq->id, " first 10 bases ", |
44
|
|
|
|
|
|
|
$seq->subseq(1,10), "\n"; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# The SeqIO system does have a filehandle binding. Most people find this |
49
|
|
|
|
|
|
|
# a little confusing, but it does mean you can write the world's |
50
|
|
|
|
|
|
|
# smallest reformatter |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
use Bio::SeqIO; |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
$in = Bio::SeqIO->newFh(-file => "inputfilename" , |
55
|
|
|
|
|
|
|
-format => 'Fasta'); |
56
|
|
|
|
|
|
|
$out = Bio::SeqIO->newFh(-format => 'EMBL'); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# World's shortest Fasta<->EMBL format converter: |
59
|
|
|
|
|
|
|
print $out $_ while <$in>; |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=head1 DESCRIPTION |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Bio::SeqIO is a handler module for the formats in the SeqIO set (eg, |
65
|
|
|
|
|
|
|
Bio::SeqIO::fasta). It is the officially sanctioned way of getting at |
66
|
|
|
|
|
|
|
the format objects, which most people should use. |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
The Bio::SeqIO system can be thought of like biological file handles. |
69
|
|
|
|
|
|
|
They are attached to filehandles with smart formatting rules (eg, |
70
|
|
|
|
|
|
|
genbank format, or EMBL format, or binary trace file format) and |
71
|
|
|
|
|
|
|
can either read or write sequence objects (Bio::Seq objects, or |
72
|
|
|
|
|
|
|
more correctly, Bio::SeqI implementing objects, of which Bio::Seq is |
73
|
|
|
|
|
|
|
one such object). If you want to know what to do with a Bio::Seq |
74
|
|
|
|
|
|
|
object, read L. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
The idea is that you request a stream object for a particular format. |
77
|
|
|
|
|
|
|
All the stream objects have a notion of an internal file that is read |
78
|
|
|
|
|
|
|
from or written to. A particular SeqIO object instance is configured |
79
|
|
|
|
|
|
|
for either input or output. A specific example of a stream object is |
80
|
|
|
|
|
|
|
the Bio::SeqIO::fasta object. |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
Each stream object has functions |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
$stream->next_seq(); |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
and |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
$stream->write_seq($seq); |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
As an added bonus, you can recover a filehandle that is tied to the |
91
|
|
|
|
|
|
|
SeqIO object, allowing you to use the standard EE and print |
92
|
|
|
|
|
|
|
operations to read and write sequence objects: |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
use Bio::SeqIO; |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
$stream = Bio::SeqIO->newFh(-format => 'Fasta', |
97
|
|
|
|
|
|
|
-fh => \*ARGV); |
98
|
|
|
|
|
|
|
# read from standard input or the input filenames |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
while ( $seq = <$stream> ) { |
101
|
|
|
|
|
|
|
# do something with $seq |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
and |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
print $stream $seq; # when stream is in output mode |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
This makes the simplest ever reformatter |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
#!/usr/bin/perl |
111
|
|
|
|
|
|
|
use strict; |
112
|
|
|
|
|
|
|
my $format1 = shift; |
113
|
|
|
|
|
|
|
my $format2 = shift || die |
114
|
|
|
|
|
|
|
"Usage: reformat format1 format2 < input > output"; |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
use Bio::SeqIO; |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
my $in = Bio::SeqIO->newFh(-format => $format1, -fh => \*ARGV ); |
119
|
|
|
|
|
|
|
my $out = Bio::SeqIO->newFh(-format => $format2 ); |
120
|
|
|
|
|
|
|
# Note: you might want to quote -format to keep older |
121
|
|
|
|
|
|
|
# perl's from complaining. |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
print $out $_ while <$in>; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=head1 CONSTRUCTORS |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
=head2 Bio::SeqIO-Enew() |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
$seqIO = Bio::SeqIO->new(-file => 'seqs.fasta', -format => $format); |
131
|
|
|
|
|
|
|
$seqIO = Bio::SeqIO->new(-fh => \*FILEHANDLE, -format => $format); |
132
|
|
|
|
|
|
|
$seqIO = Bio::SeqIO->new(-string => $string , -format => $format); |
133
|
|
|
|
|
|
|
$seqIO = Bio::SeqIO->new(-format => $format); |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
The new() class method constructs a new Bio::SeqIO object. The returned object |
136
|
|
|
|
|
|
|
can be used to retrieve or print Seq objects. new() accepts the following |
137
|
|
|
|
|
|
|
parameters: |
138
|
|
|
|
|
|
|
|
139
|
|
|
|
|
|
|
=over 5 |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=item -file |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
A file path to be opened for reading or writing. The usual Perl |
144
|
|
|
|
|
|
|
conventions apply: |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
'file' # open file for reading |
147
|
|
|
|
|
|
|
'>file' # open file for writing |
148
|
|
|
|
|
|
|
'>>file' # open file for appending |
149
|
|
|
|
|
|
|
'+
|
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
To read from or write to a piped command, open a filehandle and use the -fh |
152
|
|
|
|
|
|
|
option. |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
=item -fh |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
You may use new() with a opened filehandle, provided as a glob reference. For |
157
|
|
|
|
|
|
|
example, to read from STDIN: |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
my $seqIO = Bio::SeqIO->new(-fh => \*STDIN); |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
A string filehandle is handy if you want to modify the output in the |
162
|
|
|
|
|
|
|
memory, before printing it out. The following program reads in EMBL |
163
|
|
|
|
|
|
|
formatted entries from a file and prints them out in fasta format with |
164
|
|
|
|
|
|
|
some HTML tags: |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
use Bio::SeqIO; |
167
|
|
|
|
|
|
|
use IO::String; |
168
|
|
|
|
|
|
|
my $in = Bio::SeqIO->new(-file => "emblfile", |
169
|
|
|
|
|
|
|
-format => 'EMBL'); |
170
|
|
|
|
|
|
|
while ( my $seq = $in->next_seq() ) { |
171
|
|
|
|
|
|
|
# the output handle is reset for every file |
172
|
|
|
|
|
|
|
my $stringio = IO::String->new($string); |
173
|
|
|
|
|
|
|
my $out = Bio::SeqIO->new(-fh => $stringio, |
174
|
|
|
|
|
|
|
-format => 'fasta'); |
175
|
|
|
|
|
|
|
# output goes into $string |
176
|
|
|
|
|
|
|
$out->write_seq($seq); |
177
|
|
|
|
|
|
|
# modify $string |
178
|
|
|
|
|
|
|
$string =~ s|(>)(\w+)|$1$2|g; |
179
|
|
|
|
|
|
|
# print into STDOUT |
180
|
|
|
|
|
|
|
print $string; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
Filehandles can also be used to read from or write to a piped command: |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
use Bio::SeqIO; |
186
|
|
|
|
|
|
|
#convert .fastq.gz to .fasta |
187
|
|
|
|
|
|
|
open my $zcat, 'zcat seq.fastq.gz |' or die $!; |
188
|
|
|
|
|
|
|
my $in=Bio::SeqIO->new(-fh=>$zcat, |
189
|
|
|
|
|
|
|
-format=>'fastq'); |
190
|
|
|
|
|
|
|
my $out=Bio::SeqIO->new(-file=>'>seq.fasta', |
191
|
|
|
|
|
|
|
-format=>'fasta'); |
192
|
|
|
|
|
|
|
while (my $seq=$in->next_seq) { |
193
|
|
|
|
|
|
|
$out->write_seq($seq) |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=item -string |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
A string to read the sequences from. For example: |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
my $string = ">seq1\nACGCTAGCTAGC\n"; |
201
|
|
|
|
|
|
|
my $seqIO = Bio::SeqIO->new(-string => $string); |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=item -format |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Specify the format of the file. Supported formats include fasta, |
206
|
|
|
|
|
|
|
genbank, embl, swiss (SwissProt), Entrez Gene and tracefile formats |
207
|
|
|
|
|
|
|
such as abi (ABI) and scf. There are many more, for a complete listing |
208
|
|
|
|
|
|
|
see the SeqIO HOWTO (L). |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
If no format is specified and a filename is given then the module will |
211
|
|
|
|
|
|
|
attempt to deduce the format from the filename suffix. If there is no |
212
|
|
|
|
|
|
|
suffix that Bioperl understands then it will attempt to guess the |
213
|
|
|
|
|
|
|
format based on file content. If this is unsuccessful then SeqIO will |
214
|
|
|
|
|
|
|
throw a fatal error. |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
The format name is case-insensitive: 'FASTA', 'Fasta' and 'fasta' are |
217
|
|
|
|
|
|
|
all valid. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Currently, the tracefile formats (except for SCF) require installation |
220
|
|
|
|
|
|
|
of the external Staden "io_lib" package, as well as the |
221
|
|
|
|
|
|
|
Bio::SeqIO::staden::read package available from the bioperl-ext |
222
|
|
|
|
|
|
|
repository. |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=item -alphabet |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
Sets the alphabet ('dna', 'rna', or 'protein'). When the alphabet is |
227
|
|
|
|
|
|
|
set then Bioperl will not attempt to guess what the alphabet is. This |
228
|
|
|
|
|
|
|
may be important because Bioperl does not always guess correctly. |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
=item -flush |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
By default, all files (or filehandles) opened for writing sequences |
233
|
|
|
|
|
|
|
will be flushed after each write_seq() (making the file immediately |
234
|
|
|
|
|
|
|
usable). If you do not need this facility and would like to marginally |
235
|
|
|
|
|
|
|
improve the efficiency of writing multiple sequences to the same file |
236
|
|
|
|
|
|
|
(or filehandle), pass the -flush option '0' or any other value that |
237
|
|
|
|
|
|
|
evaluates as defined but false: |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
my $gb = Bio::SeqIO->new(-file => "
|
240
|
|
|
|
|
|
|
-format => "gb"); |
241
|
|
|
|
|
|
|
my $fa = Bio::SeqIO->new(-file => ">gball.fa", |
242
|
|
|
|
|
|
|
-format => "fasta", |
243
|
|
|
|
|
|
|
-flush => 0); # go as fast as we can! |
244
|
|
|
|
|
|
|
while($seq = $gb->next_seq) { $fa->write_seq($seq) } |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
=item -seqfactory |
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
Provide a Bio::Factory::SequenceFactoryI object. See the sequence_factory() method. |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
=item -locfactory |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
Provide a Bio::Factory::LocationFactoryI object. See the location_factory() method. |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
=item -objbuilder |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
Provide a Bio::Factory::ObjectBuilderI object. See the object_builder() method. |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=back |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
=head2 Bio::SeqIO-EnewFh() |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
$fh = Bio::SeqIO->newFh(-fh => \*FILEHANDLE, -format=>$format); |
263
|
|
|
|
|
|
|
$fh = Bio::SeqIO->newFh(-format => $format); |
264
|
|
|
|
|
|
|
# etc. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
This constructor behaves like new(), but returns a tied filehandle |
267
|
|
|
|
|
|
|
rather than a Bio::SeqIO object. You can read sequences from this |
268
|
|
|
|
|
|
|
object using the familiar EE operator, and write to it using |
269
|
|
|
|
|
|
|
print(). The usual array and $_ semantics work. For example, you can |
270
|
|
|
|
|
|
|
read all sequence objects into an array like this: |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
@sequences = <$fh>; |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
Other operations, such as read(), sysread(), write(), close(), and |
275
|
|
|
|
|
|
|
printf() are not supported. |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
=head1 OBJECT METHODS |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
See below for more detailed summaries. The main methods are: |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
=head2 $sequence = $seqIO-Enext_seq() |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
Fetch the next sequence from the stream, or nothing if no more. |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
=head2 $seqIO-Ewrite_seq($sequence [,$another_sequence,...]) |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
Write the specified sequence(s) to the stream. |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=head2 TIEHANDLE(), READLINE(), PRINT() |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
These provide the tie interface. See L for more details. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=head1 FEEDBACK |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
=head2 Mailing Lists |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
298
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
299
|
|
|
|
|
|
|
of the Bioperl mailing lists. |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
Your participation is much appreciated. |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
304
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=head2 Support |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
bioperl-l@bioperl.org |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
313
|
|
|
|
|
|
|
responsive experts will be able look at the problem and quickly |
314
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
315
|
|
|
|
|
|
|
with code and data examples if at all possible. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head2 Reporting Bugs |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
320
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
321
|
|
|
|
|
|
|
web: |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
=head1 AUTHOR - Ewan Birney, Lincoln Stein |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
Email birney@ebi.ac.uk |
328
|
|
|
|
|
|
|
lstein@cshl.org |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
=head1 APPENDIX |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
333
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=cut |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
#' Let the code begin... |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
package Bio::SeqIO; |
340
|
|
|
|
|
|
|
|
341
|
82
|
|
|
82
|
|
23241
|
use strict; |
|
82
|
|
|
|
|
143
|
|
|
82
|
|
|
|
|
3262
|
|
342
|
82
|
|
|
82
|
|
350
|
use warnings; |
|
82
|
|
|
|
|
165
|
|
|
82
|
|
|
|
|
2274
|
|
343
|
|
|
|
|
|
|
|
344
|
82
|
|
|
82
|
|
21047
|
use Bio::Factory::FTLocationFactory; |
|
82
|
|
|
|
|
203
|
|
|
82
|
|
|
|
|
2477
|
|
345
|
82
|
|
|
82
|
|
23754
|
use Bio::Seq::SeqBuilder; |
|
82
|
|
|
|
|
187
|
|
|
82
|
|
|
|
|
2166
|
|
346
|
82
|
|
|
82
|
|
23838
|
use Bio::Tools::GuessSeqFormat; |
|
82
|
|
|
|
|
203
|
|
|
82
|
|
|
|
|
2563
|
|
347
|
82
|
|
|
82
|
|
549
|
use Symbol; |
|
82
|
|
|
|
|
145
|
|
|
82
|
|
|
|
|
5064
|
|
348
|
|
|
|
|
|
|
|
349
|
82
|
|
|
82
|
|
579
|
use parent qw(Bio::Root::Root Bio::Root::IO Bio::Factory::SequenceStreamI); |
|
82
|
|
|
|
|
146
|
|
|
82
|
|
|
|
|
663
|
|
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
my %valid_alphabet_cache; |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
=head2 new |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Title : new |
357
|
|
|
|
|
|
|
Usage : $stream = Bio::SeqIO->new(-file => 'sequences.fasta', |
358
|
|
|
|
|
|
|
-format => 'fasta'); |
359
|
|
|
|
|
|
|
Function: Returns a new sequence stream |
360
|
|
|
|
|
|
|
Returns : A Bio::SeqIO stream initialised with the appropriate format |
361
|
|
|
|
|
|
|
Args : Named parameters indicating where to read the sequences from or to |
362
|
|
|
|
|
|
|
write them to: |
363
|
|
|
|
|
|
|
-file => filename, OR |
364
|
|
|
|
|
|
|
-fh => filehandle to attach to, OR |
365
|
|
|
|
|
|
|
-string => string |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
Additional arguments, all with reasonable defaults: |
368
|
|
|
|
|
|
|
-format => format of the sequences, usually auto-detected |
369
|
|
|
|
|
|
|
-alphabet => 'dna', 'rna', or 'protein' |
370
|
|
|
|
|
|
|
-flush => 0 or 1 (default: flush filehandles after each write) |
371
|
|
|
|
|
|
|
-seqfactory => sequence factory |
372
|
|
|
|
|
|
|
-locfactory => location factory |
373
|
|
|
|
|
|
|
-objbuilder => object builder |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
See L |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=cut |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
my $entry = 0; |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
sub new { |
382
|
843
|
|
|
843
|
1
|
12525
|
my ($caller, @args) = @_; |
383
|
843
|
|
33
|
|
|
3612
|
my $class = ref($caller) || $caller; |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
# or do we want to call SUPER on an object if $caller is an |
386
|
|
|
|
|
|
|
# object? |
387
|
843
|
100
|
|
|
|
3992
|
if( $class =~ /Bio::SeqIO::(\S+)/ ) { |
388
|
420
|
|
|
|
|
1894
|
my ($self) = $class->SUPER::new(@args); |
389
|
420
|
|
|
|
|
2108
|
$self->_initialize(@args); |
390
|
420
|
|
|
|
|
3141
|
return $self; |
391
|
|
|
|
|
|
|
} else { |
392
|
423
|
|
|
|
|
1847
|
my %params = @args; |
393
|
423
|
|
|
|
|
1736
|
@params{ map { lc $_ } keys %params } = values %params; # lowercase keys |
|
1056
|
|
|
|
|
3019
|
|
394
|
|
|
|
|
|
|
|
395
|
423
|
100
|
100
|
|
|
2322
|
unless( defined $params{-file} || |
|
|
|
100
|
|
|
|
|
396
|
|
|
|
|
|
|
defined $params{-fh} || |
397
|
|
|
|
|
|
|
defined $params{-string} ) { |
398
|
|
|
|
|
|
|
$class->throw("file argument provided, but with an undefined value") |
399
|
3
|
100
|
|
|
|
10
|
if exists $params{'-file'}; |
400
|
|
|
|
|
|
|
$class->throw("fh argument provided, but with an undefined value") |
401
|
2
|
100
|
|
|
|
7
|
if exists $params{'-fh'}; |
402
|
|
|
|
|
|
|
$class->throw("string argument provided, but with an undefined value") |
403
|
1
|
50
|
|
|
|
3
|
if exists($params{'-string'}); |
404
|
|
|
|
|
|
|
# $class->throw("No file, fh, or string argument provided"); # neither defined |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# Determine or guess sequence format and variant |
408
|
421
|
|
|
|
|
1054
|
my $format = $params{'-format'}; |
409
|
421
|
100
|
|
|
|
1263
|
if (! $format ) { |
410
|
27
|
100
|
|
|
|
80
|
if ($params{-file}) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
411
|
|
|
|
|
|
|
# Guess from filename extension, and then from file content |
412
|
|
|
|
|
|
|
$format = $class->_guess_format( $params{-file} ) || |
413
|
24
|
|
100
|
|
|
105
|
Bio::Tools::GuessSeqFormat->new(-file => $params{-file} )->guess; |
414
|
|
|
|
|
|
|
} elsif ($params{-fh}) { |
415
|
|
|
|
|
|
|
# Guess from filehandle content |
416
|
1
|
|
|
|
|
8
|
$format = Bio::Tools::GuessSeqFormat->new(-fh => $params{-fh} )->guess; |
417
|
|
|
|
|
|
|
} elsif ($params{-string}) { |
418
|
|
|
|
|
|
|
# Guess from string content |
419
|
1
|
|
|
|
|
9
|
$format = Bio::Tools::GuessSeqFormat->new(-text => $params{-string})->guess; |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
# changed 1-3-11; no need to print out an empty string (only way this |
424
|
|
|
|
|
|
|
# exception is triggered) - cjfields |
425
|
420
|
100
|
|
|
|
1206
|
$class->throw("Could not guess format from file, filehandle or string") |
426
|
|
|
|
|
|
|
if not $format; |
427
|
418
|
|
|
|
|
1038
|
$format = "\L$format"; # normalize capitalization to lower case |
428
|
|
|
|
|
|
|
|
429
|
418
|
100
|
|
|
|
1657
|
if ($format =~ /-/) { |
430
|
55
|
|
|
|
|
204
|
($format, my $variant) = split('-', $format, 2); |
431
|
55
|
|
|
|
|
123
|
$params{-variant} = $variant; |
432
|
|
|
|
|
|
|
} |
433
|
|
|
|
|
|
|
|
434
|
418
|
50
|
|
|
|
1679
|
return unless( $class->_load_format_module($format) ); |
435
|
418
|
|
|
|
|
3928
|
return "Bio::SeqIO::$format"->new(%params); |
436
|
|
|
|
|
|
|
} |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=head2 newFh |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Title : newFh |
443
|
|
|
|
|
|
|
Usage : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') |
444
|
|
|
|
|
|
|
Function: Does a new() followed by an fh() |
445
|
|
|
|
|
|
|
Example : $fh = Bio::SeqIO->newFh(-file=>$filename,-format=>'Format') |
446
|
|
|
|
|
|
|
$sequence = <$fh>; # read a sequence object |
447
|
|
|
|
|
|
|
print $fh $sequence; # write a sequence object |
448
|
|
|
|
|
|
|
Returns : filehandle tied to the Bio::SeqIO::Fh class |
449
|
|
|
|
|
|
|
Args : |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
See L |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
=cut |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
sub newFh { |
456
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
457
|
0
|
0
|
|
|
|
0
|
return unless my $self = $class->new(@_); |
458
|
0
|
|
|
|
|
0
|
return $self->fh; |
459
|
|
|
|
|
|
|
} |
460
|
|
|
|
|
|
|
|
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
=head2 fh |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
Title : fh |
465
|
|
|
|
|
|
|
Usage : $obj->fh |
466
|
|
|
|
|
|
|
Function: Get or set the IO filehandle |
467
|
|
|
|
|
|
|
Example : $fh = $obj->fh; # make a tied filehandle |
468
|
|
|
|
|
|
|
$sequence = <$fh>; # read a sequence object |
469
|
|
|
|
|
|
|
print $fh $sequence; # write a sequence object |
470
|
|
|
|
|
|
|
Returns : filehandle tied to Bio::SeqIO class |
471
|
|
|
|
|
|
|
Args : none |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=cut |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
sub fh { |
476
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
477
|
0
|
|
0
|
|
|
0
|
my $class = ref($self) || $self; |
478
|
0
|
|
|
|
|
0
|
my $s = Symbol::gensym; |
479
|
0
|
|
|
|
|
0
|
tie $$s,$class,$self; |
480
|
0
|
|
|
|
|
0
|
return $s; |
481
|
|
|
|
|
|
|
} |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
# _initialize is chained for all SeqIO classes |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
sub _initialize { |
487
|
436
|
|
|
436
|
|
1311
|
my($self, @args) = @_; |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
# flush is initialized by the Root::IO init |
490
|
|
|
|
|
|
|
|
491
|
436
|
|
|
|
|
2462
|
my ($seqfact,$locfact,$objbuilder, $alphabet) = |
492
|
|
|
|
|
|
|
$self->_rearrange([qw(SEQFACTORY |
493
|
|
|
|
|
|
|
LOCFACTORY |
494
|
|
|
|
|
|
|
OBJBUILDER |
495
|
|
|
|
|
|
|
ALPHABET) |
496
|
|
|
|
|
|
|
], @args); |
497
|
|
|
|
|
|
|
|
498
|
436
|
50
|
|
|
|
2149
|
$locfact = Bio::Factory::FTLocationFactory->new(-verbose => $self->verbose) |
499
|
|
|
|
|
|
|
if ! $locfact; |
500
|
436
|
50
|
|
|
|
1885
|
$objbuilder = Bio::Seq::SeqBuilder->new(-verbose => $self->verbose) |
501
|
|
|
|
|
|
|
unless $objbuilder; |
502
|
436
|
|
|
|
|
1942
|
$self->sequence_builder($objbuilder); |
503
|
436
|
|
|
|
|
1655
|
$self->location_factory($locfact); |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# note that this should come last because it propagates the sequence |
506
|
|
|
|
|
|
|
# factory to the sequence builder |
507
|
436
|
50
|
|
|
|
1075
|
$seqfact && $self->sequence_factory($seqfact); |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
#bug 2160 |
510
|
436
|
50
|
|
|
|
1001
|
$alphabet && $self->alphabet($alphabet); |
511
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# initialize the IO part |
513
|
436
|
|
|
|
|
1732
|
$self->_initialize_io(@args); |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head2 next_seq |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
Title : next_seq |
520
|
|
|
|
|
|
|
Usage : $seq = stream->next_seq |
521
|
|
|
|
|
|
|
Function: Reads the next sequence object from the stream and returns it. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
Certain driver modules may encounter entries in the stream |
524
|
|
|
|
|
|
|
that are either misformatted or that use syntax not yet |
525
|
|
|
|
|
|
|
understood by the driver. If such an incident is |
526
|
|
|
|
|
|
|
recoverable, e.g., by dismissing a feature of a feature |
527
|
|
|
|
|
|
|
table or some other non-mandatory part of an entry, the |
528
|
|
|
|
|
|
|
driver will issue a warning. In the case of a |
529
|
|
|
|
|
|
|
non-recoverable situation an exception will be thrown. Do |
530
|
|
|
|
|
|
|
not assume that you can resume parsing the same stream |
531
|
|
|
|
|
|
|
after catching the exception. Note that you can always turn |
532
|
|
|
|
|
|
|
recoverable errors into exceptions by calling |
533
|
|
|
|
|
|
|
$stream->verbose(2). |
534
|
|
|
|
|
|
|
|
535
|
|
|
|
|
|
|
Returns : a Bio::Seq sequence object, or nothing if no more sequences |
536
|
|
|
|
|
|
|
are available |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
Args : none |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
See L, L, L |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
=cut |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
sub next_seq { |
545
|
0
|
|
|
0
|
1
|
0
|
my ($self, $seq) = @_; |
546
|
0
|
|
|
|
|
0
|
$self->throw("Sorry, you cannot read from a generic Bio::SeqIO object."); |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
=head2 write_seq |
551
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
Title : write_seq |
553
|
|
|
|
|
|
|
Usage : $stream->write_seq($seq) |
554
|
|
|
|
|
|
|
Function: writes the $seq object into the stream |
555
|
|
|
|
|
|
|
Returns : 1 for success and 0 for error |
556
|
|
|
|
|
|
|
Args : Bio::Seq object |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=cut |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
sub write_seq { |
561
|
0
|
|
|
0
|
1
|
0
|
my ($self, $seq) = @_; |
562
|
0
|
|
|
|
|
0
|
$self->throw("Sorry, you cannot write to a generic Bio::SeqIO object."); |
563
|
|
|
|
|
|
|
} |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
=head2 format |
567
|
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
Title : format |
569
|
|
|
|
|
|
|
Usage : $format = $stream->format() |
570
|
|
|
|
|
|
|
Function: Get the sequence format |
571
|
|
|
|
|
|
|
Returns : sequence format, e.g. fasta, fastq |
572
|
|
|
|
|
|
|
Args : none |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
=cut |
575
|
|
|
|
|
|
|
|
576
|
|
|
|
|
|
|
# format() method inherited from Bio::Root::IO |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
=head2 alphabet |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
Title : alphabet |
582
|
|
|
|
|
|
|
Usage : $self->alphabet($newval) |
583
|
|
|
|
|
|
|
Function: Set/get the molecule type for the Seq objects to be created. |
584
|
|
|
|
|
|
|
Example : $seqio->alphabet('protein') |
585
|
|
|
|
|
|
|
Returns : value of alphabet: 'dna', 'rna', or 'protein' |
586
|
|
|
|
|
|
|
Args : newvalue (optional) |
587
|
|
|
|
|
|
|
Throws : Exception if the argument is not one of 'dna', 'rna', or 'protein' |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
=cut |
590
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
sub alphabet { |
592
|
80
|
|
|
80
|
1
|
141
|
my ($self, $value) = @_; |
593
|
|
|
|
|
|
|
|
594
|
80
|
100
|
|
|
|
155
|
if ( defined $value) { |
595
|
2
|
|
|
|
|
4
|
$value = lc $value; |
596
|
2
|
50
|
|
|
|
6
|
unless ($valid_alphabet_cache{$value}) { |
597
|
|
|
|
|
|
|
# instead of hard-coding the allowed values once more, we check by |
598
|
|
|
|
|
|
|
# creating a dummy sequence object |
599
|
2
|
|
|
|
|
4
|
eval { |
600
|
2
|
|
|
|
|
9
|
require Bio::PrimarySeq; |
601
|
2
|
|
|
|
|
7
|
my $seq = Bio::PrimarySeq->new( -verbose => $self->verbose, |
602
|
|
|
|
|
|
|
-alphabet => $value ); |
603
|
|
|
|
|
|
|
}; |
604
|
2
|
50
|
|
|
|
6
|
if ($@) { |
605
|
0
|
|
|
|
|
0
|
$self->throw("Invalid alphabet: $value\n. See Bio::PrimarySeq for allowed values."); |
606
|
|
|
|
|
|
|
} |
607
|
2
|
|
|
|
|
5
|
$valid_alphabet_cache{$value} = 1; |
608
|
|
|
|
|
|
|
} |
609
|
2
|
|
|
|
|
4
|
$self->{'alphabet'} = $value; |
610
|
|
|
|
|
|
|
} |
611
|
80
|
|
|
|
|
165
|
return $self->{'alphabet'}; |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
=head2 _load_format_module |
616
|
|
|
|
|
|
|
|
617
|
|
|
|
|
|
|
Title : _load_format_module |
618
|
|
|
|
|
|
|
Usage : *INTERNAL SeqIO stuff* |
619
|
|
|
|
|
|
|
Function: Loads up (like use) a module at run time on demand |
620
|
|
|
|
|
|
|
Example : |
621
|
|
|
|
|
|
|
Returns : |
622
|
|
|
|
|
|
|
Args : |
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
=cut |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
sub _load_format_module { |
627
|
418
|
|
|
418
|
|
1196
|
my ($self, $format) = @_; |
628
|
418
|
|
|
|
|
1127
|
my $module = "Bio::SeqIO::" . $format; |
629
|
418
|
|
|
|
|
663
|
my $ok; |
630
|
|
|
|
|
|
|
|
631
|
418
|
|
|
|
|
734
|
eval { |
632
|
418
|
|
|
|
|
2251
|
$ok = $self->_load_module($module); |
633
|
|
|
|
|
|
|
}; |
634
|
418
|
50
|
|
|
|
1162
|
if ( $@ ) { |
635
|
0
|
|
|
|
|
0
|
print STDERR <
|
636
|
|
|
|
|
|
|
$self: $format cannot be found |
637
|
|
|
|
|
|
|
Exception $@ |
638
|
|
|
|
|
|
|
For more information about the SeqIO system please see the SeqIO docs. |
639
|
|
|
|
|
|
|
This includes ways of checking for formats at compile time, not run time |
640
|
|
|
|
|
|
|
END |
641
|
|
|
|
|
|
|
; |
642
|
|
|
|
|
|
|
} |
643
|
418
|
|
|
|
|
1384
|
return $ok; |
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
|
647
|
|
|
|
|
|
|
=head2 _concatenate_lines |
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
Title : _concatenate_lines |
650
|
|
|
|
|
|
|
Usage : $s = _concatenate_lines($line, $continuation_line) |
651
|
|
|
|
|
|
|
Function: Private. Concatenates two strings assuming that the second stems |
652
|
|
|
|
|
|
|
from a continuation line of the first. Adds a space between both |
653
|
|
|
|
|
|
|
unless the first ends with a dash. |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
Takes care of either arg being empty. |
656
|
|
|
|
|
|
|
Example : |
657
|
|
|
|
|
|
|
Returns : A string. |
658
|
|
|
|
|
|
|
Args : |
659
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
=cut |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
sub _concatenate_lines { |
663
|
516
|
|
|
516
|
|
1237
|
my ($self, $s1, $s2) = @_; |
664
|
516
|
50
|
66
|
|
|
1490
|
$s1 .= " " if($s1 && ($s1 !~ /-$/) && $s2); |
|
|
|
66
|
|
|
|
|
665
|
516
|
100
|
|
|
|
1705
|
return ($s1 ? $s1 : "") . ($s2 ? $s2 : ""); |
|
|
50
|
|
|
|
|
|
666
|
|
|
|
|
|
|
} |
667
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
=head2 _filehandle |
670
|
|
|
|
|
|
|
|
671
|
|
|
|
|
|
|
Title : _filehandle |
672
|
|
|
|
|
|
|
Usage : $obj->_filehandle($newval) |
673
|
|
|
|
|
|
|
Function: This method is deprecated. Call _fh() instead. |
674
|
|
|
|
|
|
|
Example : |
675
|
|
|
|
|
|
|
Returns : value of _filehandle |
676
|
|
|
|
|
|
|
Args : newvalue (optional) |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
=cut |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
sub _filehandle { |
681
|
7
|
|
|
7
|
|
13
|
my ($self,@args) = @_; |
682
|
7
|
|
|
|
|
19
|
return $self->_fh(@args); |
683
|
|
|
|
|
|
|
} |
684
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
=head2 _guess_format |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
Title : _guess_format |
689
|
|
|
|
|
|
|
Usage : $obj->_guess_format($filename) |
690
|
|
|
|
|
|
|
Function: guess format based on file suffix |
691
|
|
|
|
|
|
|
Example : |
692
|
|
|
|
|
|
|
Returns : guessed format of filename (lower case) |
693
|
|
|
|
|
|
|
Args : |
694
|
|
|
|
|
|
|
Notes : formats that _filehandle() will guess include fasta, |
695
|
|
|
|
|
|
|
genbank, scf, pir, embl, raw, gcg, ace, bsml, swissprot, |
696
|
|
|
|
|
|
|
fastq and phd/phred |
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
=cut |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
sub _guess_format { |
701
|
24
|
|
|
24
|
|
40
|
my $class = shift; |
702
|
24
|
50
|
|
|
|
75
|
return unless $_ = shift; |
703
|
|
|
|
|
|
|
|
704
|
24
|
50
|
|
|
|
125
|
return 'abi' if /\.ab[i1]$/i; |
705
|
24
|
100
|
|
|
|
87
|
return 'ace' if /\.ace$/i; |
706
|
23
|
50
|
|
|
|
75
|
return 'alf' if /\.alf$/i; |
707
|
23
|
50
|
|
|
|
127
|
return 'bsml' if /\.(bsm|bsml)$/i; |
708
|
23
|
50
|
|
|
|
80
|
return 'ctf' if /\.ctf$/i; |
709
|
23
|
100
|
|
|
|
89
|
return 'embl' if /\.(embl|ebl|emb|dat)$/i; |
710
|
22
|
50
|
|
|
|
74
|
return 'entrezgene' if /\.asn$/i; |
711
|
22
|
50
|
|
|
|
70
|
return 'exp' if /\.exp$/i; |
712
|
22
|
100
|
|
|
|
114
|
return 'fasta' if /\.(fasta|fast|fas|seq|fa|fsa|nt|aa|fna|faa)$/i; |
713
|
15
|
100
|
|
|
|
73
|
return 'fastq' if /\.fastq$/i; |
714
|
13
|
100
|
|
|
|
49
|
return 'gcg' if /\.gcg$/i; |
715
|
12
|
100
|
|
|
|
75
|
return 'genbank' if /\.(gb|gbank|genbank|gbk|gbs)$/i; |
716
|
8
|
50
|
|
|
|
36
|
return 'phd' if /\.(phd|phred)$/i; |
717
|
8
|
100
|
|
|
|
37
|
return 'pir' if /\.pir$/i; |
718
|
7
|
50
|
|
|
|
33
|
return 'pln' if /\.pln$/i; |
719
|
7
|
50
|
|
|
|
28
|
return 'qual' if /\.qual$/i; |
720
|
7
|
50
|
|
|
|
28
|
return 'raw' if /\.txt$/i; |
721
|
7
|
50
|
|
|
|
31
|
return 'scf' if /\.scf$/i; |
722
|
|
|
|
|
|
|
# from Strider 1.4 Release Notes: The file name extensions used by |
723
|
|
|
|
|
|
|
# Strider 1.4 are ".xdna", ".xdgn", ".xrna" and ".xprt" for DNA, |
724
|
|
|
|
|
|
|
# DNA Degenerate, RNA and Protein Sequence Files, respectively |
725
|
7
|
50
|
|
|
|
30
|
return 'strider' if /\.(xdna|xdgn|xrna|xprt)$/i; |
726
|
7
|
100
|
|
|
|
48
|
return 'swiss' if /\.(swiss|sp)$/i; |
727
|
6
|
50
|
|
|
|
91
|
return 'ztr' if /\.ztr$/i; |
728
|
|
|
|
|
|
|
} |
729
|
|
|
|
|
|
|
|
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
sub DESTROY { |
732
|
431
|
|
|
431
|
|
34084
|
my $self = shift; |
733
|
431
|
|
|
|
|
2535
|
$self->close(); |
734
|
|
|
|
|
|
|
} |
735
|
|
|
|
|
|
|
|
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
sub TIEHANDLE { |
738
|
0
|
|
|
0
|
|
0
|
my ($class,$val) = @_; |
739
|
0
|
|
|
|
|
0
|
return bless {'seqio' => $val}, $class; |
740
|
|
|
|
|
|
|
} |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
sub READLINE { |
744
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
745
|
0
|
0
|
0
|
|
|
0
|
return $self->{'seqio'}->next_seq() || undef unless wantarray; |
746
|
0
|
|
|
|
|
0
|
my (@list, $obj); |
747
|
0
|
|
|
|
|
0
|
push @list, $obj while $obj = $self->{'seqio'}->next_seq(); |
748
|
0
|
|
|
|
|
0
|
return @list; |
749
|
|
|
|
|
|
|
} |
750
|
|
|
|
|
|
|
|
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
sub PRINT { |
753
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
754
|
0
|
|
|
|
|
0
|
$self->{'seqio'}->write_seq(@_); |
755
|
|
|
|
|
|
|
} |
756
|
|
|
|
|
|
|
|
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
=head2 sequence_factory |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
Title : sequence_factory |
761
|
|
|
|
|
|
|
Usage : $seqio->sequence_factory($seqfactory) |
762
|
|
|
|
|
|
|
Function: Get/Set the Bio::Factory::SequenceFactoryI |
763
|
|
|
|
|
|
|
Returns : Bio::Factory::SequenceFactoryI |
764
|
|
|
|
|
|
|
Args : [optional] Bio::Factory::SequenceFactoryI |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
=cut |
767
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
sub sequence_factory { |
769
|
1565
|
|
|
1565
|
1
|
2973
|
my ($self, $obj) = @_; |
770
|
1565
|
100
|
|
|
|
3122
|
if( defined $obj ) { |
771
|
413
|
50
|
33
|
|
|
3678
|
if( ! ref($obj) || ! $obj->isa('Bio::Factory::SequenceFactoryI') ) { |
772
|
0
|
|
|
|
|
0
|
$self->throw("Must provide a valid Bio::Factory::SequenceFactoryI object to ".ref($self)."::sequence_factory()"); |
773
|
|
|
|
|
|
|
} |
774
|
413
|
|
|
|
|
1073
|
$self->{'_seqio_seqfactory'} = $obj; |
775
|
413
|
|
|
|
|
1022
|
my $builder = $self->sequence_builder(); |
776
|
413
|
50
|
66
|
|
|
4195
|
if($builder && $builder->can('sequence_factory') && |
|
|
|
66
|
|
|
|
|
777
|
|
|
|
|
|
|
(! $builder->sequence_factory())) { |
778
|
411
|
|
|
|
|
997
|
$builder->sequence_factory($obj); |
779
|
|
|
|
|
|
|
} |
780
|
|
|
|
|
|
|
} |
781
|
1565
|
|
|
|
|
5637
|
$self->{'_seqio_seqfactory'}; |
782
|
|
|
|
|
|
|
} |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
=head2 object_factory |
786
|
|
|
|
|
|
|
|
787
|
|
|
|
|
|
|
Title : object_factory |
788
|
|
|
|
|
|
|
Usage : $obj->object_factory($newval) |
789
|
|
|
|
|
|
|
Function: This is an alias to sequence_factory with a more generic name. |
790
|
|
|
|
|
|
|
Example : |
791
|
|
|
|
|
|
|
Returns : value of object_factory (a scalar) |
792
|
|
|
|
|
|
|
Args : on set, new value (a scalar or undef, optional) |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
=cut |
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
sub object_factory{ |
797
|
0
|
|
|
0
|
1
|
0
|
return shift->sequence_factory(@_); |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
|
800
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
=head2 sequence_builder |
802
|
|
|
|
|
|
|
|
803
|
|
|
|
|
|
|
Title : sequence_builder |
804
|
|
|
|
|
|
|
Usage : $seqio->sequence_builder($seqfactory) |
805
|
|
|
|
|
|
|
Function: Get/Set the Bio::Factory::ObjectBuilderI used to build sequence |
806
|
|
|
|
|
|
|
objects. This applies to rich sequence formats only, e.g. genbank |
807
|
|
|
|
|
|
|
but not fasta. |
808
|
|
|
|
|
|
|
|
809
|
|
|
|
|
|
|
If you do not set the sequence object builder yourself, it |
810
|
|
|
|
|
|
|
will in fact be an instance of L, and |
811
|
|
|
|
|
|
|
you may use all methods documented there to configure it. |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
Returns : a Bio::Factory::ObjectBuilderI compliant object |
814
|
|
|
|
|
|
|
Args : [optional] a Bio::Factory::ObjectBuilderI compliant object |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
=cut |
817
|
|
|
|
|
|
|
|
818
|
|
|
|
|
|
|
sub sequence_builder { |
819
|
1054
|
|
|
1054
|
1
|
1869
|
my ($self, $obj) = @_; |
820
|
1054
|
100
|
|
|
|
2159
|
if( defined $obj ) { |
821
|
436
|
50
|
33
|
|
|
3099
|
if( ! ref($obj) || ! $obj->isa('Bio::Factory::ObjectBuilderI') ) { |
822
|
0
|
|
|
|
|
0
|
$self->throw("Must provide a valid Bio::Factory::ObjectBuilderI object to ".ref($self)."::sequence_builder()"); |
823
|
|
|
|
|
|
|
} |
824
|
436
|
|
|
|
|
1002
|
$self->{'_object_builder'} = $obj; |
825
|
|
|
|
|
|
|
} |
826
|
1054
|
|
|
|
|
2271
|
$self->{'_object_builder'}; |
827
|
|
|
|
|
|
|
} |
828
|
|
|
|
|
|
|
|
829
|
|
|
|
|
|
|
|
830
|
|
|
|
|
|
|
=head2 location_factory |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
Title : location_factory |
833
|
|
|
|
|
|
|
Usage : $seqio->location_factory($locfactory) |
834
|
|
|
|
|
|
|
Function: Get/Set the Bio::Factory::LocationFactoryI object to be used for |
835
|
|
|
|
|
|
|
location string parsing |
836
|
|
|
|
|
|
|
Returns : a Bio::Factory::LocationFactoryI implementing object |
837
|
|
|
|
|
|
|
Args : [optional] on set, a Bio::Factory::LocationFactoryI implementing |
838
|
|
|
|
|
|
|
object. |
839
|
|
|
|
|
|
|
|
840
|
|
|
|
|
|
|
=cut |
841
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
sub location_factory { |
843
|
9416
|
|
|
9416
|
1
|
14976
|
my ($self,$obj) = @_; |
844
|
9416
|
100
|
|
|
|
16546
|
if( defined $obj ) { |
845
|
436
|
50
|
33
|
|
|
2929
|
if( ! ref($obj) || ! $obj->isa('Bio::Factory::LocationFactoryI') ) { |
846
|
0
|
|
|
|
|
0
|
$self->throw("Must provide a valid Bio::Factory::LocationFactoryI" . |
847
|
|
|
|
|
|
|
" object to ".ref($self)."->location_factory()"); |
848
|
|
|
|
|
|
|
} |
849
|
436
|
|
|
|
|
893
|
$self->{'_seqio_locfactory'} = $obj; |
850
|
|
|
|
|
|
|
} |
851
|
9416
|
|
|
|
|
21153
|
$self->{'_seqio_locfactory'}; |
852
|
|
|
|
|
|
|
} |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
1; |
855
|
|
|
|
|
|
|
|