line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package BioX::Seq::Fetch; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
690
|
use 5.012; |
|
1
|
|
|
|
|
6
|
|
4
|
1
|
|
|
1
|
|
8
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
29
|
|
5
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
37
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
6
|
use BioX::Seq; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
8
|
1
|
|
|
1
|
|
1341
|
use File::stat; |
|
1
|
|
|
|
|
9109
|
|
|
1
|
|
|
|
|
5
|
|
9
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
99
|
use constant MAGIC_GZIP => pack('C3', (0x1f, 0x8b, 0x08)); |
|
1
|
|
|
|
|
30
|
|
|
1
|
|
|
|
|
1532
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub new { |
13
|
|
|
|
|
|
|
|
14
|
10
|
|
|
10
|
1
|
610
|
my ($class,$fn,%args) = @_; |
15
|
|
|
|
|
|
|
|
16
|
10
|
100
|
|
|
|
44
|
die "Must define input filename for random access" |
17
|
|
|
|
|
|
|
if (! defined $fn); |
18
|
|
|
|
|
|
|
|
19
|
9
|
|
|
|
|
37
|
my $self = bless {fn => $fn}, $class; |
20
|
|
|
|
|
|
|
|
21
|
9
|
100
|
|
|
|
382
|
open my $fh, '<', $fn |
22
|
|
|
|
|
|
|
or die "Error opening $fn for reading: $!\n"; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# read magic bytes and reset filehandle |
25
|
8
|
|
|
|
|
76
|
my $old_layers = join '', map {":$_"} PerlIO::get_layers($fh); |
|
16
|
|
|
|
|
57
|
|
26
|
8
|
|
|
|
|
26
|
binmode($fh); |
27
|
8
|
|
|
|
|
156
|
read( $fh, my $magic_start, 3 ); |
28
|
8
|
|
|
|
|
32
|
read( $fh, my $magic_end, 1 ); |
29
|
8
|
|
|
|
|
119
|
binmode($fh,$old_layers); |
30
|
8
|
|
|
|
|
62
|
seek($fh,0,0); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# detect compression (must be BGZIP) |
33
|
8
|
100
|
|
|
|
33
|
if ($magic_start eq MAGIC_GZIP) { |
34
|
3
|
100
|
|
|
|
30
|
die "Gzip files must be compressed with bgzip for non-sequential access" |
35
|
|
|
|
|
|
|
if (ord($magic_end) != 0x04); |
36
|
|
|
|
|
|
|
# TODO: find way to test missing library during unit testing |
37
|
|
|
|
|
|
|
require Compress::BGZF::Reader |
38
|
2
|
50
|
|
|
|
706
|
or die "Compress::BGZF::Reader not installed, no gzip support"; |
39
|
2
|
|
|
|
|
26010
|
close $fh; |
40
|
2
|
|
|
|
|
43
|
$fh = Compress::BGZF::Reader->new_filehandle($fn); |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
|
43
|
7
|
100
|
|
|
|
1818
|
$self->{with_description} = 1 if ($args{with_description}); |
44
|
|
|
|
|
|
|
|
45
|
7
|
|
|
|
|
19
|
$self->{fh} = $fh; |
46
|
|
|
|
|
|
|
|
47
|
7
|
|
|
|
|
27
|
$self->_load_faidx; |
48
|
|
|
|
|
|
|
|
49
|
3
|
|
|
|
|
97
|
return $self; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
sub write_index { |
54
|
|
|
|
|
|
|
|
55
|
8
|
|
|
8
|
1
|
653
|
my ($self, $fn_idx) = @_; |
56
|
|
|
|
|
|
|
|
57
|
8
|
|
66
|
|
|
53
|
$fn_idx //= $self->{fn} . '.fai'; |
58
|
|
|
|
|
|
|
|
59
|
8
|
50
|
|
|
|
75
|
if (-e $fn_idx) { |
60
|
0
|
|
|
|
|
0
|
warn "Index file exists, won't overwrite\n"; |
61
|
0
|
|
|
|
|
0
|
return 0; |
62
|
|
|
|
|
|
|
} |
63
|
|
|
|
|
|
|
|
64
|
8
|
50
|
|
|
|
528
|
open my $idx, '>:raw', $fn_idx |
65
|
|
|
|
|
|
|
or die "Error opening $fn_idx for writing: $!\n"; |
66
|
|
|
|
|
|
|
|
67
|
8
|
|
|
|
|
30
|
my $fh = $self->{fh}; |
68
|
|
|
|
|
|
|
|
69
|
8
|
|
|
|
|
31
|
my $curr_id; |
70
|
|
|
|
|
|
|
my $curr_len; |
71
|
8
|
|
|
|
|
0
|
my $curr_bases; |
72
|
8
|
|
|
|
|
0
|
my $curr_bytes; |
73
|
8
|
|
|
|
|
0
|
my $curr_offset; |
74
|
8
|
|
|
|
|
16
|
my $bl_mismatch = 0; |
75
|
8
|
|
|
|
|
14
|
my $ll_mismatch = 0; |
76
|
8
|
|
|
|
|
106
|
while (my $line = <$fh>) { |
77
|
65
|
100
|
|
|
|
2148
|
if ($line =~ /^>(\S+)/) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
78
|
25
|
100
|
|
|
|
51
|
if (defined $curr_id) { |
79
|
17
|
|
|
|
|
22
|
say {$idx} join "\t", |
|
17
|
|
|
|
|
117
|
|
80
|
|
|
|
|
|
|
$curr_id, |
81
|
|
|
|
|
|
|
$curr_len, |
82
|
|
|
|
|
|
|
$curr_offset, |
83
|
|
|
|
|
|
|
$curr_bases, |
84
|
|
|
|
|
|
|
$curr_bytes, |
85
|
|
|
|
|
|
|
; |
86
|
|
|
|
|
|
|
} |
87
|
25
|
|
|
|
|
92
|
$curr_id = $1; |
88
|
25
|
|
|
|
|
56
|
$curr_offset = tell $fh; |
89
|
25
|
|
|
|
|
77
|
$curr_len = 0; |
90
|
25
|
|
|
|
|
37
|
$curr_bases = undef; |
91
|
25
|
|
|
|
|
34
|
$curr_bytes = undef; |
92
|
25
|
|
|
|
|
46
|
$bl_mismatch = 0; |
93
|
25
|
|
|
|
|
74
|
$ll_mismatch = 0; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
elsif ($line =~ /^[A-Za-z\-]+(\r?\n?)$/) { |
96
|
37
|
100
|
|
|
|
75
|
if ($bl_mismatch) { |
97
|
1
|
|
|
|
|
12
|
close $idx; |
98
|
1
|
|
|
|
|
43
|
unlink $fn_idx; |
99
|
1
|
|
|
|
|
27
|
die "Base length mismatch\n"; |
100
|
|
|
|
|
|
|
} |
101
|
36
|
100
|
|
|
|
55
|
if ($ll_mismatch) { |
102
|
1
|
|
|
|
|
13
|
close $idx; |
103
|
1
|
|
|
|
|
38
|
unlink $fn_idx; |
104
|
1
|
|
|
|
|
25
|
die "Line length mismatch\n"; |
105
|
|
|
|
|
|
|
} |
106
|
35
|
|
|
|
|
62
|
my $ll = length $line; |
107
|
35
|
|
|
|
|
63
|
my $bl = $ll - length $1; |
108
|
35
|
|
|
|
|
43
|
$curr_len += $bl; |
109
|
35
|
|
66
|
|
|
112
|
$curr_bases //= $bl; |
110
|
35
|
|
66
|
|
|
105
|
$curr_bytes //= $ll; |
111
|
35
|
100
|
|
|
|
58
|
$bl_mismatch = 1 if ($bl != $curr_bases); |
112
|
35
|
100
|
|
|
|
136
|
$ll_mismatch = 1 if ($ll != $curr_bytes); |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
elsif ($line =~ /\S/) { |
115
|
1
|
|
|
|
|
34
|
close $idx; |
116
|
1
|
|
|
|
|
76
|
unlink $fn_idx; |
117
|
1
|
|
|
|
|
27
|
die "Unexpected content: $line"; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# write remaining index |
123
|
|
|
|
|
|
|
# uncoverable branch false |
124
|
5
|
50
|
|
|
|
124
|
if (defined $curr_id) { |
125
|
5
|
|
|
|
|
10
|
say {$idx} join "\t", |
|
5
|
|
|
|
|
21
|
|
126
|
|
|
|
|
|
|
$curr_id, |
127
|
|
|
|
|
|
|
$curr_len, |
128
|
|
|
|
|
|
|
$curr_offset, |
129
|
|
|
|
|
|
|
$curr_bases, |
130
|
|
|
|
|
|
|
$curr_bytes, |
131
|
|
|
|
|
|
|
; |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
5
|
|
|
|
|
201
|
close $idx; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
#reset filehandle |
137
|
5
|
|
|
|
|
41
|
seek $self->{fh}, 0, 0; |
138
|
|
|
|
|
|
|
|
139
|
5
|
|
|
|
|
1246
|
return 1; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub _load_faidx { |
145
|
|
|
|
|
|
|
|
146
|
7
|
|
|
7
|
|
16
|
my ($self) = @_; |
147
|
|
|
|
|
|
|
|
148
|
7
|
|
|
|
|
19
|
my $fn_idx = $self->{fn} . '.fai'; |
149
|
7
|
50
|
|
|
|
143
|
$self->write_index if (! -e $fn_idx); |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# make sure FASTA file is not newer than its index; otherwise index file |
152
|
|
|
|
|
|
|
# may be out of date and cause silent corruption downstream |
153
|
4
|
|
|
|
|
27
|
my $mtime_fas = stat($self->{fn})->mtime; |
154
|
4
|
|
|
|
|
818
|
my $mtime_idx = stat($fn_idx)->mtime; |
155
|
4
|
50
|
|
|
|
606
|
if ($mtime_fas > $mtime_idx) { |
156
|
0
|
|
|
|
|
0
|
die "Index file exists but is older than FASTA file and probably out" |
157
|
|
|
|
|
|
|
. " of date. Refresh or remove the existing index before" |
158
|
|
|
|
|
|
|
. " proceeding.\n" |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
|
161
|
4
|
|
|
|
|
9
|
my @ids; |
162
|
|
|
|
|
|
|
|
163
|
4
|
50
|
|
|
|
146
|
open my $in, '<', $fn_idx |
164
|
|
|
|
|
|
|
or die "Error opening index file: $!\n"; |
165
|
4
|
|
|
|
|
66
|
while (my $line = <$in>) { |
166
|
15
|
|
|
|
|
32
|
chomp $line; |
167
|
15
|
|
|
|
|
64
|
my ($name, $len, $offset, $bases_per_line, $bytes_per_line) |
168
|
|
|
|
|
|
|
= split "\t", $line; |
169
|
|
|
|
|
|
|
die "ERROR: $fn_idx contains duplicate entries" |
170
|
15
|
100
|
|
|
|
72
|
if ( defined $self->{idx}->{$name} ); |
171
|
14
|
|
|
|
|
29
|
my $eol = $bytes_per_line - $bases_per_line; |
172
|
14
|
|
|
|
|
49
|
$self->{idx}->{$name} = [$len, $offset, $bases_per_line, $eol]; |
173
|
14
|
|
|
|
|
68
|
push @ids, $name; |
174
|
|
|
|
|
|
|
} |
175
|
3
|
|
|
|
|
47
|
close $in; |
176
|
3
|
|
|
|
|
15
|
$self->{ids} = \@ids; |
177
|
|
|
|
|
|
|
|
178
|
3
|
|
|
|
|
15
|
return; |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub ids { |
183
|
|
|
|
|
|
|
|
184
|
1
|
|
|
1
|
1
|
3
|
my ($self) = @_; |
185
|
1
|
|
|
|
|
3
|
return @{ $self->{ids} }; |
|
1
|
|
|
|
|
11
|
|
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
sub length { |
190
|
|
|
|
|
|
|
|
191
|
2
|
|
|
2
|
1
|
569
|
my ($self, $id) = @_; |
192
|
2
|
|
|
|
|
9
|
return $self->{idx}->{$id}->[0]; |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
sub fetch_seq { |
197
|
|
|
|
|
|
|
|
198
|
8
|
|
|
8
|
1
|
619
|
my ($self, $id, $start_bp, $end_bp) = @_; |
199
|
|
|
|
|
|
|
|
200
|
8
|
50
|
|
|
|
28
|
return undef if (! defined $self->{idx}->{$id}); |
201
|
8
|
|
|
|
|
13
|
my ($len, $off, $bpl, $eol) = @{ $self->{idx}->{$id} }; |
|
8
|
|
|
|
|
21
|
|
202
|
|
|
|
|
|
|
|
203
|
8
|
|
|
|
|
16
|
my $fh = $self->{fh}; |
204
|
|
|
|
|
|
|
|
205
|
8
|
|
100
|
|
|
26
|
$start_bp = $start_bp // 1; |
206
|
8
|
|
66
|
|
|
20
|
$end_bp = $end_bp // $len; |
207
|
8
|
|
|
|
|
16
|
--$start_bp; #make 0-based |
208
|
8
|
100
|
100
|
|
|
66
|
die "Coordinates out of bounds" if ($start_bp < 0 || $end_bp > $len); |
209
|
6
|
|
|
|
|
11
|
my $l = $end_bp - $start_bp; |
210
|
|
|
|
|
|
|
|
211
|
6
|
|
|
|
|
35
|
seek $fh, $off + $start_bp + int(($start_bp)/$bpl)*$eol, 0; |
212
|
6
|
|
|
|
|
259
|
read($fh, my $seq, $l + int(($l+$start_bp%$bpl)/$bpl)*$eol); |
213
|
6
|
|
|
|
|
171
|
$seq =~ s/\s//g; |
214
|
|
|
|
|
|
|
|
215
|
6
|
|
|
|
|
11
|
++$start_bp; |
216
|
|
|
|
|
|
|
|
217
|
6
|
|
|
|
|
10
|
my $desc; |
218
|
6
|
100
|
100
|
|
|
29
|
if ($start_bp > 1 || $end_bp < $len) { |
219
|
2
|
|
|
|
|
8
|
$desc = "($start_bp-$end_bp)"; |
220
|
|
|
|
|
|
|
} |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# backtrack to find defline, if asked |
223
|
6
|
100
|
|
|
|
16
|
if ($self->{with_description}) { |
224
|
|
|
|
|
|
|
|
225
|
1
|
|
|
|
|
9
|
my $string = ''; |
226
|
1
|
|
|
|
|
3
|
my $p = $off; |
227
|
1
|
|
|
|
|
2
|
my $chr; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
BACK: |
230
|
1
|
|
|
|
|
5
|
while ($p > 0) { |
231
|
65
|
|
|
|
|
82
|
--$p; |
232
|
65
|
|
|
|
|
156
|
seek $fh, $p, 0; |
233
|
65
|
50
|
|
|
|
2126
|
read($fh, $chr, 1) |
234
|
|
|
|
|
|
|
or die "Error during backtrack read: $@\n"; |
235
|
|
|
|
|
|
|
# stop at record separator |
236
|
65
|
100
|
|
|
|
1729
|
if ($chr eq '>') { |
237
|
2
|
50
|
|
|
|
7
|
last BACK if ($p == 0); |
238
|
2
|
|
|
|
|
3
|
my $prev; |
239
|
2
|
|
|
|
|
6
|
seek $fh, $p-1, 0; |
240
|
2
|
|
|
|
|
88
|
read($fh, $prev, 1); |
241
|
2
|
100
|
|
|
|
59
|
last BACK if ($prev =~ /[\r\n]/); |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
64
|
|
|
|
|
152
|
$string = $chr . $string; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
1
|
50
|
|
|
|
5
|
die "Backtracking inexplicably failed\n" |
248
|
|
|
|
|
|
|
if (! CORE::length $string); |
249
|
1
|
50
|
|
|
|
8
|
if ($string =~ /^(\S+)(?:\s+(.*))?$/) { |
250
|
1
|
|
|
|
|
5
|
my $nid = $1; |
251
|
1
|
|
|
|
|
4
|
$desc = join ' ', grep {defined $_} $2, $desc; |
|
2
|
|
|
|
|
7
|
|
252
|
1
|
50
|
|
|
|
5
|
die "ID mismatch ($id vs $nid)!\n" |
253
|
|
|
|
|
|
|
if ($nid ne $id); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
6
|
|
|
|
|
27
|
return BioX::Seq->new($seq, $id, $desc); |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
1; |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
__END__ |