File Coverage

Bio/SeqIO/fasta.pm
Criterion Covered Total %
statement 93 105 88.5
branch 38 62 61.2
condition 17 36 47.2
subroutine 11 11 100.0
pod 5 5 100.0
total 164 219 74.8


line stmt bran cond sub pod time code
1             # BioPerl module for Bio::SeqIO::fasta
2             #
3             # Please direct questions and support issues to
4             #
5             # Cared for by Ewan Birney
6             # and Lincoln Stein
7             #
8             # Copyright Ewan Birney & Lincoln Stein
9             #
10             # You may distribute this module under the same terms as perl itself
11             # _history
12             # October 18, 1999 Largely rewritten by Lincoln Stein
13              
14             # POD documentation - main docs before the code
15              
16             =head1 NAME
17              
18             Bio::SeqIO::fasta - fasta sequence input/output stream
19              
20             =head1 SYNOPSIS
21              
22             Do not use this module directly. Use it via the Bio::SeqIO class.
23              
24             =head1 DESCRIPTION
25              
26             This object can transform Bio::Seq objects to and from fasta flat
27             file databases.
28              
29             =head1 FEEDBACK
30              
31             =head2 Mailing Lists
32              
33             User feedback is an integral part of the evolution of this and other
34             Bioperl modules. Send your comments and suggestions preferably to one
35             of the Bioperl mailing lists. Your participation is much appreciated.
36              
37             bioperl-l@bioperl.org - General discussion
38             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
39              
40             =head2 Support
41              
42             Please direct usage questions or support issues to the mailing list:
43              
44             I
45              
46             rather than to the module maintainer directly. Many experienced and
47             reponsive experts will be able look at the problem and quickly
48             address it. Please include a thorough description of the problem
49             with code and data examples if at all possible.
50              
51             =head2 Reporting Bugs
52              
53             Report bugs to the Bioperl bug tracking system to help us keep track
54             the bugs and their resolution. Bug reports can be submitted via the
55             web:
56              
57             https://github.com/bioperl/bioperl-live/issues
58              
59             =head1 AUTHORS - Ewan Birney & Lincoln Stein
60              
61             Email: birney@ebi.ac.uk
62             lstein@cshl.org
63              
64             =head1 CONTRIBUTORS
65              
66             Jason Stajich, jason-at-bioperl.org
67              
68             =head1 APPENDIX
69              
70             The rest of the documentation details each of the object
71             methods. Internal methods are usually preceded with a _
72              
73             =cut
74              
75             # Let the code begin...
76              
77             package Bio::SeqIO::fasta;
78 22     22   553 use strict;
  22         26  
  22         559  
79 22     22   73 use warnings;
  22         24  
  22         533  
80              
81 22     22   6673 use Bio::Seq::SeqFastaSpeedFactory;
  22         39  
  22         562  
82              
83 22     22   113 use parent qw(Bio::SeqIO);
  22         24  
  22         135  
84              
85             sub _initialize {
86 40     40   78 my ($self, @args) = @_;
87 40         138 $self->SUPER::_initialize(@args);
88              
89             ## Initialize fasta specific parameters
90             ## There are some problems with _rearrange. If there's no value for one of
91             ## the parameters, it will return an empty value (not undef). This means we
92             ## can't just merge two hashes since the empty values would override the
93             ## defaults anyway.
94 40         131 my (%defs) = (
95             "width" => 60,
96             "block" => "", # default is same as width
97             "preferred_id_type" => "display",
98             );
99 40         104 foreach my $param (keys %defs) {
100             $self->$param( $self->_rearrange([$param], @args) ||
101 120   33     271 $defs{$param});
102             }
103              
104 40 50       162 unless ( defined $self->sequence_factory ) {
105 40         225 $self->sequence_factory(Bio::Seq::SeqFastaSpeedFactory->new());
106             }
107             }
108              
109             =head2 next_seq
110              
111             Title : next_seq
112             Usage : $seq = $stream->next_seq()
113             Function: returns the next sequence in the stream
114             Returns : Bio::Seq object, or nothing if no more available
115             Args : NONE
116              
117             =cut
118              
119             sub next_seq {
120 89     89 1 2662 my( $self ) = @_;
121 89         77 my $seq;
122             my $alphabet;
123 89         341 local $/ = "\n>";
124 89 100       231 return unless my $entry = $self->_readline;
125              
126             # Replacing chomp for s///, since chomp is not working in some cases
127 80         523 $entry =~ s/\n$//;
128 80         84 $entry =~ s/\r$//;
129 80 50       280 if ($entry =~ m/\A\s*\Z/s) { # very first one
130 0 0       0 return unless $entry = $self->_readline;
131 0         0 chomp($entry);
132             }
133              
134             # this just checks the initial input; beyond that, due to setting $/ above,
135             # the > is part of the record separator and is removed
136 80 100 100     306 $self->throw("The sequence does not appear to be FASTA format ".
137             "(lacks a descriptor line '>')") if $. == 1 && $entry !~ /^>/;
138              
139 76         617 $entry =~ s/^>//;
140              
141 76         696 my ($top,$sequence) = split(/\n/,$entry,2);
142 76 100       784 defined $sequence && $sequence =~ s/>//g;
143             #my ($top,$sequence) = $entry =~ /^>?(.+?)\n+([^>]*)/s
144             # or $self->throw("Can't parse fasta entry");
145              
146 76         68 my ($id,$fulldesc);
147 76 50       226 if( $top =~ /^\s*(\S+)\s*(.*)/ ) {
148 76         175 ($id,$fulldesc) = ($1,$2);
149             }
150              
151 76 50 33     279 if (defined $id && $id eq '') {$id=$fulldesc;} # FIX incase no space
  0         0  
152             # between > and name \AE
153 76 100       1419 defined $sequence && $sequence =~ tr/ \t\n\r//d; # Remove whitespace
154              
155             # for empty sequences we need to know the mol.type
156 76         224 $alphabet = $self->alphabet();
157 76 100 100     254 if(defined $sequence && length($sequence) == 0) {
158 2 50       4 if(! defined($alphabet)) {
159             # let's default to dna
160 2         2 $alphabet = "dna";
161             }
162             }# else {
163             # we don't need it really, so disable
164             # we want to keep this if SeqIO alphabet was set by user
165             # not sure if this could break something
166             #$alphabet = undef;
167             #}
168              
169 76         151 $seq = $self->sequence_factory->create(
170             -seq => $sequence,
171             -id => $id,
172             # Ewan's note - I don't think this healthy
173             # but obviously to taste.
174             #-primary_id => $id,
175             -desc => $fulldesc,
176             -alphabet => $alphabet,
177             -direct => 1,
178             );
179              
180             # if there wasn't one before, set the guessed type
181             #unless ( defined $alphabet ) {
182             # don't assume that all our seqs are the same as the first one found
183             #$self->alphabet($seq->alphabet());
184             #}
185 76         239 return $seq;
186              
187             }
188              
189             =head2 write_seq
190              
191             Title : write_seq
192             Usage : $stream->write_seq(@seq)
193             Function: Writes the $seq object into the stream
194             Returns : 1 for success and 0 for error
195             Args : Array of 1 or more Bio::PrimarySeqI objects
196              
197             =cut
198              
199             sub write_seq {
200 7     7 1 31 my ($self,@seq) = @_;
201 7         21 my $width = $self->width;
202 7         16 my $block = $self->block;
203              
204             ## take a reference for single string (the sequence) and add the whitespace
205             local *format_str = sub {
206 7     7   10 my $str = $_[0];
207 7         58 my @lines = unpack ("(A$width)*", $$str);
208 7 50       17 if ($block >= $width) {
209 7         34 $$str = join ("\n", @lines)."\n";
210             } else {
211 0         0 $$str = "";
212 0         0 $$str .= join (" ", unpack ("(A$block)*", $_)) . "\n" foreach (@lines);
213             }
214 7         45 };
215              
216 7         16 foreach my $seq (@seq) {
217 7 50 33     91 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
      33        
218             unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
219              
220             # Allow for different ids
221 7         8 my $top;
222 7         16 my $id_type = $self->preferred_id_type;
223 7 50       52 if( $id_type =~ /^acc/i ) {
    50          
    0          
224 0         0 $top = $seq->accession_number();
225 0 0       0 if( $id_type =~ /vers/i ) {
226 0         0 $top .= "." . $seq->version();
227             }
228             } elsif($id_type =~ /^displ/i ) {
229 7 50 33     23 $self->warn("No whitespace allowed in FASTA ID [". $seq->display_id. "]")
230             if defined $seq->display_id && $seq->display_id =~ /\s/;
231 7         16 $top = $seq->display_id();
232 7 50       18 $top = '' unless defined $top;
233 7 50 33     35 $self->warn("No whitespace allowed in FASTA ID [". $top. "]")
234             if defined $top && $top =~ /\s/;
235             } elsif($id_type =~ /^pri/i ) {
236 0         0 $top = $seq->primary_id();
237             }
238              
239 7 100 66     68 if ($seq->can('desc') and my $desc = $seq->desc()) {
240 6         11 $desc =~ s/\n//g;
241 6         17 $top .= " $desc";
242             }
243              
244 7 100       35 if( $seq->isa('Bio::Seq::LargeSeqI') ) {
245 1         6 $self->_print(">$top\n");
246             # for large seqs, don't call seq(), it defeats the
247             # purpose of the largeseq functionality. instead get
248             # chunks of the seq, $width at a time
249 1         1 my $buff_max = 2000;
250 1         6 my $buff_size = int($buff_max/$width)*$width; #< buffer is even multiple of widths
251 1         6 my $seq_length = $seq->length;
252 1         2 my $num_chunks = int($seq_length/$buff_size+1);
253 1         3 for( my $c = 0; $c < $num_chunks; $c++ ) {
254 1         1 my $buff_end = $buff_size*($c+1);
255 1 50       3 $buff_end = $seq_length if $buff_end > $seq_length;
256 1         4 my $buff = $seq->subseq($buff_size*$c+1,$buff_end);
257 1 50       3 if($buff) {
258 1         2 format_str (\$buff);
259 1         2 $self->_print($buff);
260             } else {
261 0         0 $self->_print("\n");
262             }
263             }
264             } else {
265 6         16 my $str = $seq->seq;
266 6 50 33     38 if(defined $str && length($str) > 0) {
267 6         14 format_str (\$str);
268             } else {
269 0         0 $str = "\n";
270             }
271 6 50       35 $self->_print (">",$top,"\n",$str) or return;
272             }
273             }
274              
275 7 50 33     20 $self->flush if $self->_flush_on_write && defined $self->_fh;
276 7         59 return 1;
277             }
278              
279             =head2 width
280              
281             Title : width
282             Usage : $obj->width($newval)
283             Function: Get/Set the line width for FASTA output (not counting whitespace).
284             Returns : value of width
285             Args : newvalue (optional)
286              
287             =cut
288              
289             sub width {
290 94     94 1 94 my ($self,$value) = @_;
291 94 100       166 if (defined $value) {
292 40         64 $self->{'width'} = $value;
293             }
294 94         176 return $self->{'width'};
295             }
296              
297             =head2 block
298              
299             Title : block
300             Usage : $obj->block($newval)
301             Function: Get/Set the length of each block for FASTA output. Sequence blocks
302             will be split with a space. Configuring block, to a value of 10 for
303             example, allows to easily indentify a position in a sequence by eye.
304             Default : same value used for width.
305             Returns : value of block
306             Args : newvalue (optional)
307              
308             =cut
309              
310             sub block {
311 47     47 1 80 my ($self,$value) = @_;
312 47 100       121 if (defined $value) {
313 40         63 $self->{'block'} = $value;
314             }
315 47   33     148 return $self->{'block'} || $self->width;
316             }
317              
318             =head2 preferred_id_type
319              
320             Title : preferred_id_type
321             Usage : $obj->preferred_id_type('accession')
322             Function: Get/Set the preferred type of identifier to use in the ">ID" position
323             for FASTA output.
324             Returns : string, one of values defined in @Bio::SeqIO::fasta::SEQ_ID_TYPES.
325             Default : display
326             Args : string when setting. This must be one of values defined in
327             @Bio::SeqIO::fasta::SEQ_ID_TYPES. Allowable values:
328             accession, accession.version, display, primary
329             Throws : fatal exception if the supplied id type is not in @SEQ_ID_TYPES.
330              
331             =cut
332              
333             our @SEQ_ID_TYPES = qw(accession accession.version display primary);
334              
335             sub preferred_id_type {
336 48     48 1 59 my ($self,$type) = @_;
337 48 100       118 if (defined $type) {
338 41 50       161 if( ! grep lc($type) eq $_, @SEQ_ID_TYPES) {
339 0         0 $self->throw(-class=>'Bio::Root::BadParameter',
340             -text=>"Invalid ID type \"$type\". Must be one of: @SEQ_ID_TYPES");
341             }
342 41         88 $self->{'_seq_id_type'} = lc($type);
343             }
344 48         153 $self->{'_seq_id_type'};
345             }
346              
347             1;