line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# BioPerl module for Bio::SeqIO::largefasta |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Please direct questions and support issues to |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Cared for by Jason Stajich |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# Copyright Jason Stajich |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
10
|
|
|
|
|
|
|
# _history |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::SeqIO::largefasta - method i/o on very large fasta sequence files |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
Do not use this module directly. Use it via the Bio::SeqIO class. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This object can transform Bio::Seq objects to and from fasta flat |
25
|
|
|
|
|
|
|
file databases. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This module handles very large sequence files by using the |
28
|
|
|
|
|
|
|
Bio::Seq::LargePrimarySeq module to store all the sequence data in |
29
|
|
|
|
|
|
|
a file. This can be a problem if you have limited disk space on your |
30
|
|
|
|
|
|
|
computer because this will effectively cause 2 copies of the sequence |
31
|
|
|
|
|
|
|
file to reside on disk for the life of the |
32
|
|
|
|
|
|
|
Bio::Seq::LargePrimarySeq object. The default location for this is |
33
|
|
|
|
|
|
|
specified by the L<File::Spec>-E<gt>tmpdir routine which is usually /tmp |
34
|
|
|
|
|
|
|
on UNIX. If a sequence file is larger than the swap space (capacity |
35
|
|
|
|
|
|
|
of the /tmp dir) this could cause problems for the machine. It is |
36
|
|
|
|
|
|
|
possible to set the directory where the temporary file is located by |
37
|
|
|
|
|
|
|
adding the following line to your code BEFORE calling next_seq. See |
38
|
|
|
|
|
|
|
L<Bio::Seq::LargePrimarySeq> for more information. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
$Bio::Seq::LargePrimarySeq::DEFAULT_TEMP_DIR = 'newdir'; |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 FEEDBACK |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head2 Mailing Lists |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
47
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
48
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
51
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
=head2 Support |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
I<bioperl-l@bioperl.org> |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
60
|
|
|
|
|
|
|
responsive experts will be able to look at the problem and quickly |
61
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
62
|
|
|
|
|
|
|
with code and data examples if at all possible. |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=head2 Reporting Bugs |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
67
|
|
|
|
|
|
|
of the bugs and their resolution. Bug reports can be submitted via the web: |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
=head1 AUTHORS - Jason Stajich |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
Email: jason@bioperl.org |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
=head1 APPENDIX |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
78
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
=cut |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Let the code begin... |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
package Bio::SeqIO::largefasta; |
85
|
1
|
|
|
1
|
|
444
|
use vars qw($FASTALINELEN); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
33
|
|
86
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
14
|
|
87
|
|
|
|
|
|
|
|
88
|
1
|
|
|
1
|
|
236
|
use Bio::Seq::SeqFactory; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
$FASTALINELEN = 60; |
91
|
1
|
|
|
1
|
|
4
|
use base qw(Bio::SeqIO); |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
402
|
|
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Object set-up hook: runs the parent class initialisation with the same
# arguments, then installs a default sequence factory when none is set.
#
# Args    : the argument list received by the method, forwarded unchanged
#           to SUPER::_initialize.
# Effect  : if sequence_factory() is still undefined afterwards, a
#           Bio::Seq::SeqFactory producing 'Bio::Seq::LargePrimarySeq'
#           objects is created (with this object's verbosity) and stored.
sub _initialize {
    my ( $self, @init_args ) = @_;

    $self->SUPER::_initialize(@init_args);

    if ( !defined $self->sequence_factory ) {
        # Nothing was configured for us: default to the large-sequence type.
        my $default_factory = Bio::Seq::SeqFactory->new(
            -verbose => $self->verbose(),
            -type    => 'Bio::Seq::LargePrimarySeq',
        );
        $self->sequence_factory($default_factory);
    }
}
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
=head2 next_seq |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
Title : next_seq |
106
|
|
|
|
|
|
|
Usage : $seq = $stream->next_seq() |
107
|
|
|
|
|
|
|
Function: returns the next sequence in the stream |
108
|
|
|
|
|
|
|
Returns : A Bio::Seq::LargePrimarySeq object |
109
|
|
|
|
|
|
|
Args : NONE |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=cut |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# Read one FASTA record from the stream, line by line, into an object
# obtained from the sequence factory.
#
# Args    : none
# Returns : the populated sequence object, or nothing (bare return) when
#           the stream ended before any header line was seen.
#
# Parsing rules:
#   * Before a header has been seen, the first line containing '>' is the
#     header. Text after the '>' is split into id (first word) and
#     description (the rest). A header with nothing after the '>' starts
#     a record without an id.
#   * After the header, a line that starts with optional whitespace and
#     '>' belongs to the next record: it is pushed back and the current
#     object is returned.
#   * Every other line has all whitespace removed and is appended to the
#     sequence.
sub next_seq {
    my ($self) = @_;

    my $largeseq = $self->sequence_factory->create();
    my ( $id, $fulldesc );
    my $count = 0;    # lines consumed by this call (progress reporting only)
    my $seen  = 0;    # set to 1 once this record's header line was read

    while ( defined( my $entry = $self->_readline ) ) {

        # Start of the following record: hand the line back and stop here.
        if ( $seen && $entry =~ /^\s*>/ ) {
            $self->_pushback($entry);
            return $largeseq;
        }

        # Only the first header of a record is parsed; once $seen is set a
        # '>' in the middle of a data line must not clobber id/description.
        if ( !$seen && $entry =~ />(.*)$/ ) {
            my $header = $1;
            $seen = 1;

            # An empty header (a bare '>' line, with or without a trailing
            # newline) marks a record start but carries no id to assign.
            if ( length $header ) {
                ( $id, $fulldesc ) = ( $header =~ /^\s*(\S+)\s*(.*)$/ )
                    or $self->warn("Can't parse fasta header");
                $largeseq->display_id($id);
                $largeseq->primary_id($id);
                $largeseq->desc($fulldesc);
            }
        }
        else {
            $entry =~ s/\s+//g;
            $largeseq->add_sequence_as_string($entry);
        }

        # Progress goes to STDERR so it cannot mix with sequence output
        # that is being written to STDOUT.
        if ( ++$count % 1000 == 0 && $self->verbose() > 0 ) {
            print STDERR "line $count\n";
        }
    }

    return unless $seen;
    return $largeseq;
}
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head2 write_seq |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
Title : write_seq |
147
|
|
|
|
|
|
|
Usage : $stream->write_seq(@seq) |
148
|
|
|
|
|
|
|
Function: writes the $seq object into the stream |
149
|
|
|
|
|
|
|
Returns : 1 for success and 0 for error |
150
|
|
|
|
|
|
|
Args : Bio::Seq object |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
=cut |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
# Write each given sequence object to the stream in FASTA format: a
# ">id description" header line followed by the residues in lines of at
# most $FASTALINELEN characters.
#
# Args    : one or more sequence objects; each must provide id(), length()
#           and subseq(), and may provide desc().
# Returns : 1 when every argument was written, 0 when at least one
#           argument was not a usable object (it is skipped with a warning).
sub write_seq {
    my ( $self, @seq ) = @_;
    require Scalar::Util;

    my $status = 1;
    foreach my $seq (@seq) {

        # Calling a method on undef or a plain scalar would die; report
        # the problem through the return value instead.
        unless ( Scalar::Util::blessed($seq) && $seq->can('subseq') ) {
            $self->warn("write_seq: skipping argument that is not a sequence object");
            $status = 0;
            next;
        }

        my $top = $seq->id();
        if ( $seq->can('desc') ) {
            my $desc = $seq->desc();

            # Test for presence, not truth, so a description of "0" is kept.
            if ( defined $desc && length $desc ) {
                $desc =~ s/[\r\n]//g;    # the header must stay on one line
                $top .= " $desc";
            }
        }
        $self->_print( ">", $top, "\n" );

        # Emit the sequence in fixed-width slices, fetched one at a time
        # through subseq(); the last slice is clipped to the sequence end.
        my $end   = $seq->length();
        my $start = 1;
        while ( $start <= $end ) {
            my $stop = $start + $FASTALINELEN - 1;
            $stop = $end if ( $stop > $end );
            $self->_print( $seq->subseq( $start, $stop ), "\n" );
            $start += $FASTALINELEN;
        }
    }

    $self->flush if $self->_flush_on_write && defined $self->_fh;
    return $status;
}
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
1; |