line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# POD at __END__, let the code begin... |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
package Bio::SeqIO::fastq; |
4
|
4
|
|
|
4
|
|
449
|
use strict; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
114
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
669
|
use Bio::Seq::SeqFactory; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
91
|
|
7
|
|
|
|
|
|
|
|
8
|
4
|
|
|
4
|
|
14
|
use base qw(Bio::SeqIO); |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
5420
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our %variant = ( |
11
|
|
|
|
|
|
|
sanger => { |
12
|
|
|
|
|
|
|
'offset' => 33, |
13
|
|
|
|
|
|
|
'qual_start' => 0, |
14
|
|
|
|
|
|
|
'qual_end' => 93 |
15
|
|
|
|
|
|
|
}, |
16
|
|
|
|
|
|
|
solexa => { |
17
|
|
|
|
|
|
|
'offset' => 64, |
18
|
|
|
|
|
|
|
'qual_start' => -5, |
19
|
|
|
|
|
|
|
'qual_end' => 62 |
20
|
|
|
|
|
|
|
}, |
21
|
|
|
|
|
|
|
illumina => { |
22
|
|
|
|
|
|
|
'offset' => 64, |
23
|
|
|
|
|
|
|
'qual_start' => 0, |
24
|
|
|
|
|
|
|
'qual_end' => 62 |
25
|
|
|
|
|
|
|
}, |
26
|
|
|
|
|
|
|
); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub _initialize { |
29
|
59
|
|
|
59
|
|
90
|
my($self,@args) = @_; |
30
|
59
|
|
|
|
|
137
|
$self->SUPER::_initialize(@args); |
31
|
59
|
|
|
|
|
180
|
my ($variant, $validate, $header) = $self->_rearrange([qw(VARIANT |
32
|
|
|
|
|
|
|
VALIDATE |
33
|
|
|
|
|
|
|
QUALITY_HEADER)], @args); |
34
|
59
|
|
100
|
|
|
149
|
$variant ||= 'sanger'; |
35
|
59
|
|
|
|
|
109
|
$self->variant($variant); |
36
|
59
|
|
|
|
|
140
|
$self->_init_tables($variant); |
37
|
59
|
50
|
|
|
|
149
|
$validate = defined $validate ? $validate : 1; |
38
|
59
|
|
|
|
|
119
|
$self->validate($validate); |
39
|
59
|
50
|
|
|
|
100
|
$header && $self->quality_header($header); |
40
|
|
|
|
|
|
|
|
41
|
59
|
50
|
|
|
|
133
|
if( ! defined $self->sequence_factory ) { |
42
|
59
|
|
|
|
|
139
|
$self->sequence_factory(Bio::Seq::SeqFactory->new( |
43
|
|
|
|
|
|
|
-verbose => $self->verbose(), |
44
|
|
|
|
|
|
|
-type => 'Bio::Seq::Quality') |
45
|
|
|
|
|
|
|
); |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
} |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub next_seq { |
50
|
393
|
|
|
393
|
1
|
1069
|
my( $self ) = @_; |
51
|
393
|
|
|
|
|
560
|
while (defined(my $data = $self->next_dataset)) { |
52
|
|
|
|
|
|
|
# Are FASTQ sequences w/o any sequence valid? Removing for now |
53
|
|
|
|
|
|
|
# -cjfields 6.22.09 |
54
|
359
|
|
|
|
|
815
|
my $seq = $self->sequence_factory->create(%$data); |
55
|
359
|
|
|
|
|
926
|
return $seq; |
56
|
|
|
|
|
|
|
} |
57
|
14
|
|
|
|
|
23
|
return; |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# pure perl version |
61
|
|
|
|
|
|
|
sub next_dataset { |
62
|
402
|
|
|
402
|
1
|
316
|
my $self = shift; |
63
|
402
|
|
|
|
|
1268
|
local $/ = "\n"; |
64
|
402
|
|
|
|
|
283
|
my $data; |
65
|
402
|
|
|
|
|
353
|
my $mode = '-seq'; |
66
|
|
|
|
|
|
|
# speed this up by directly accessing the filehandle and in-lining the |
67
|
|
|
|
|
|
|
# _readline stuff vs. making the repeated method calls. Tradeoff is speed |
68
|
|
|
|
|
|
|
# over repeated code. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# we can probably normalize line endings using PerlIO::eol or |
71
|
|
|
|
|
|
|
# Encode::Newlines |
72
|
|
|
|
|
|
|
|
73
|
402
|
|
|
|
|
760
|
my $fh = $self->_fh; |
74
|
402
|
|
100
|
|
|
1368
|
my $line = $self->{lastline} || <$fh>; |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
FASTQ: |
77
|
402
|
|
|
|
|
579
|
while (defined $line) { |
78
|
1908
|
|
|
|
|
1905
|
$line =~ s/\015\012/\012/; |
79
|
1908
|
|
|
|
|
1584
|
$line =~ tr/\015/\n/; |
80
|
1908
|
100
|
100
|
|
|
8061
|
if ($mode eq '-seq' && $line =~ m{^@([^\n]+)$}xmso) { |
|
|
100
|
100
|
|
|
|
|
81
|
388
|
|
|
|
|
1136
|
$data->{-descriptor} = $1; |
82
|
388
|
|
|
|
|
265
|
my ($id,$fulldesc); |
83
|
388
|
50
|
|
|
|
1142
|
if ($data->{-descriptor} =~ /^\s*(\S+)\s*(.*)/) { |
84
|
388
|
|
|
|
|
688
|
($id,$fulldesc) = ($1, $2); |
85
|
|
|
|
|
|
|
} else { |
86
|
0
|
|
|
|
|
0
|
$self->throw("Can't parse fastq header"); |
87
|
|
|
|
|
|
|
} |
88
|
388
|
|
|
|
|
487
|
$data->{-id} = $id; |
89
|
388
|
|
|
|
|
421
|
$data->{-desc} = $fulldesc; |
90
|
388
|
|
|
|
|
624
|
$data->{-namespace} = $self->variant; |
91
|
|
|
|
|
|
|
} elsif ($mode eq '-seq' && $line =~ m{^\+([^\n]*)}xmso) { |
92
|
384
|
|
|
|
|
518
|
my $desc = $1; |
93
|
384
|
50
|
|
|
|
639
|
$self->throw("No description line parsed") unless $data->{-descriptor}; |
94
|
384
|
100
|
100
|
|
|
1253
|
if ($desc && $data->{-descriptor} ne $desc) { |
95
|
|
|
|
|
|
|
$self->throw("Quality descriptor [$desc] doesn't match seq ". |
96
|
2
|
|
|
|
|
25
|
"descriptor ".$data->{-descriptor}.", line: $." ); |
97
|
|
|
|
|
|
|
} |
98
|
382
|
|
|
|
|
434
|
$mode = '-raw_quality'; |
99
|
|
|
|
|
|
|
} else { |
100
|
1136
|
100
|
100
|
|
|
3596
|
if ($mode eq '-raw_quality' && defined($data->{-raw_quality}) && |
|
|
|
100
|
|
|
|
|
101
|
|
|
|
|
|
|
(length($data->{-raw_quality}) >= length($data->{-seq}))) { |
102
|
352
|
|
|
|
|
396
|
$self->{lastline} = $line; |
103
|
|
|
|
|
|
|
last FASTQ |
104
|
352
|
|
|
|
|
418
|
} |
105
|
784
|
|
|
|
|
734
|
chomp $line; |
106
|
784
|
100
|
|
|
|
1625
|
if ($line =~ /^$/) { |
107
|
3
|
|
|
|
|
5
|
delete $self->{lastline}; |
108
|
3
|
|
|
|
|
4
|
last FASTQ; |
109
|
|
|
|
|
|
|
} |
110
|
781
|
|
|
|
|
1500
|
$data->{$mode} .= $line |
111
|
|
|
|
|
|
|
} |
112
|
1551
|
|
|
|
|
4250
|
$line = <$fh>; |
113
|
1551
|
100
|
|
|
|
2796
|
if (!defined $line) { |
114
|
31
|
|
|
|
|
38
|
delete $self->{lastline}; |
115
|
31
|
|
|
|
|
52
|
last FASTQ; |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
400
|
100
|
|
|
|
608
|
return unless $data; |
120
|
386
|
100
|
66
|
|
|
927
|
if (!$data->{-seq} || !defined($data->{-raw_quality})) { |
121
|
6
|
|
|
|
|
23
|
$self->throw("Missing sequence and/or quality data; line: $."); |
122
|
|
|
|
|
|
|
} |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# simple quality control tests |
125
|
380
|
100
|
|
|
|
686
|
if (length $data->{-seq} != length $data->{-raw_quality}) { |
126
|
|
|
|
|
|
|
$self->throw("Quality string [".$data->{-raw_quality}."] of length [". |
127
|
|
|
|
|
|
|
length($data->{-raw_quality})."]\ndoesn't match length of sequence ". |
128
|
3
|
|
|
|
|
25
|
$data->{-seq}."\n[".length($data->{-seq})."], line: $."); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
$data->{-qual} = [map { |
132
|
69676
|
100
|
66
|
|
|
162496
|
if ($self->{_validate_qual} && !exists($self->{chr2qual}->{$_})) { |
133
|
9
|
|
|
|
|
37
|
$self->throw("Unknown symbol with ASCII value ".ord($_)." outside ". |
134
|
|
|
|
|
|
|
"of quality range") |
135
|
|
|
|
|
|
|
# TODO: fallback? |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
$self->variant eq 'solexa' ? |
138
|
|
|
|
|
|
|
$self->{sol2phred}->{$self->{chr2qual}->{$_}}: |
139
|
69667
|
100
|
|
|
|
68474
|
$self->{chr2qual}->{$_}; |
140
|
377
|
|
|
|
|
8866
|
} unpack("A1" x length($data->{-raw_quality}), $data->{-raw_quality})]; |
141
|
368
|
|
|
|
|
5021
|
return $data; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# This should be creating fastq output only. Bio::SeqIO::fasta and |
145
|
|
|
|
|
|
|
# Bio::SeqIO::qual should be used for that output |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub write_seq { |
148
|
9
|
|
|
9
|
1
|
17
|
my ($self,@seq) = @_; |
149
|
9
|
|
|
|
|
20
|
my $var = $self->variant; |
150
|
9
|
|
|
|
|
18
|
foreach my $seq (@seq) { |
151
|
9
|
50
|
|
|
|
34
|
unless ($seq->isa("Bio::Seq::Quality")){ |
152
|
0
|
|
|
|
|
0
|
$self->warn("You can't write FASTQ without supplying a Bio::Seq::". |
153
|
|
|
|
|
|
|
"Quality object! ".ref($seq)."\n"); |
154
|
0
|
|
|
|
|
0
|
next; |
155
|
|
|
|
|
|
|
} |
156
|
9
|
|
50
|
|
|
26
|
my $str = $seq->seq || ''; |
157
|
9
|
|
|
|
|
11
|
my @qual = @{$seq->qual}; |
|
9
|
|
|
|
|
24
|
|
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# this should be the origin of the sequence (illumina, solexa, sanger) |
160
|
9
|
|
|
|
|
25
|
my $ns= $seq->namespace; |
161
|
|
|
|
|
|
|
|
162
|
9
|
|
|
|
|
21
|
my $top = $seq->display_id(); |
163
|
9
|
100
|
|
|
|
25
|
if (my $desc = $seq->desc()) { |
164
|
6
|
|
|
|
|
22
|
$desc =~ s/\n//g; |
165
|
6
|
|
|
|
|
14
|
$top .= " $desc"; |
166
|
|
|
|
|
|
|
} |
167
|
9
|
|
|
|
|
14
|
my $qual = ''; |
168
|
|
|
|
|
|
|
my $qual_map = |
169
|
|
|
|
|
|
|
($ns eq 'solexa' && $var eq 'solexa') ? $self->{phred_fp2chr} : |
170
|
|
|
|
|
|
|
($var eq 'solexa') ? $self->{phred_int2chr} : |
171
|
9
|
100
|
100
|
|
|
46
|
$self->{qual2chr}; |
|
|
100
|
|
|
|
|
|
172
|
|
|
|
|
|
|
|
173
|
9
|
|
|
|
|
10
|
my %bad_qual; |
174
|
9
|
|
|
|
|
16
|
for my $q (@qual) { |
175
|
543
|
100
|
100
|
|
|
1145
|
$q = sprintf("%.0f", $q) if ($var ne 'solexa' && $ns eq 'solexa'); |
176
|
543
|
100
|
|
|
|
645
|
if (exists $qual_map->{$q}) { |
177
|
481
|
|
|
|
|
439
|
$qual .= $qual_map->{$q}; |
178
|
481
|
|
|
|
|
354
|
next; |
179
|
|
|
|
|
|
|
} else { |
180
|
|
|
|
|
|
|
# fuzzy mapping, for edited qual scores |
181
|
62
|
|
|
|
|
62
|
my $qr = sprintf("%.0f",$q); |
182
|
62
|
|
|
|
|
197
|
my $bounds = sprintf("%.1f-%.1f",$qr-0.5, $qr+0.5); |
183
|
62
|
50
|
|
|
|
80
|
if (exists $self->{fuzzy_qual2chr}->{$bounds}) { |
184
|
0
|
|
|
|
|
0
|
$qual .= $self->{fuzzy_qual2chr}->{$bounds}; |
185
|
0
|
|
|
|
|
0
|
next; |
186
|
|
|
|
|
|
|
} else { |
187
|
|
|
|
|
|
|
my $rep = ($q <= $self->{qual_start}) ? |
188
|
62
|
50
|
|
|
|
69
|
$qual_map->{$self->{qual_start}} : $qual_map->{$self->{qual_end}}; |
189
|
62
|
|
|
|
|
42
|
$qual .= $rep; |
190
|
62
|
|
|
|
|
84
|
$bad_qual{$q}++; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
9
|
100
|
66
|
|
|
41
|
if ($self->{_validate_qual} && %bad_qual) { |
195
|
|
|
|
|
|
|
$self->warn("Data loss for $var: following values not found\n". |
196
|
2
|
|
|
|
|
24
|
join(',',sort {$a <=> $b} keys %bad_qual)) |
|
232
|
|
|
|
|
155
|
|
197
|
|
|
|
|
|
|
} |
198
|
9
|
50
|
|
|
|
38
|
$self->_print("\@",$top,"\n",$str,"\n") or return; |
199
|
9
|
50
|
|
|
|
37
|
$self->_print("+",($self->{_quality_header} ? $top : ''),"\n",$qual,"\n") or return; |
|
|
50
|
|
|
|
|
|
200
|
|
|
|
|
|
|
} |
201
|
9
|
|
|
|
|
16
|
return 1; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
sub write_fastq { |
205
|
0
|
|
|
0
|
1
|
0
|
my ($self,@seq) = @_; |
206
|
0
|
|
|
|
|
0
|
return $self->write_seq(@seq); |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
sub write_fasta { |
210
|
0
|
|
|
0
|
1
|
0
|
my ($self,@seq) = @_; |
211
|
0
|
0
|
|
|
|
0
|
if (!exists($self->{fasta_proxy})) { |
212
|
0
|
|
|
|
|
0
|
$self->{fasta_proxy} = Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh); |
213
|
|
|
|
|
|
|
} |
214
|
0
|
|
|
|
|
0
|
return $self->{fasta_proxy}->write_seq(@seq); |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
sub write_qual { |
218
|
0
|
|
|
0
|
1
|
0
|
my ($self,@seq) = @_; |
219
|
0
|
0
|
|
|
|
0
|
if (!exists($self->{qual_proxy})) { |
220
|
0
|
|
|
|
|
0
|
$self->{qual_proxy} = Bio::SeqIO->new(-format => 'qual', -fh => $self->_fh); |
221
|
|
|
|
|
|
|
} |
222
|
0
|
|
|
|
|
0
|
return $self->{qual_proxy}->write_seq(@seq); |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# variant() method inherited from Bio::Root::IO |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub _init_tables { |
228
|
59
|
|
|
59
|
|
68
|
my ($self, $var) = @_; |
229
|
|
|
|
|
|
|
# cache encode/decode values for quicker accession |
230
|
|
|
|
|
|
|
($self->{qual_start}, $self->{qual_end}, $self->{qual_offset}) = |
231
|
59
|
|
|
|
|
62
|
@{ $variant{$var} }{qw(qual_start qual_end offset)}; |
|
59
|
|
|
|
|
181
|
|
232
|
59
|
100
|
|
|
|
120
|
if ($var eq 'solexa') { |
233
|
10
|
|
|
|
|
31
|
for my $q ($self->{qual_start} .. $self->{qual_end}) { |
234
|
680
|
|
|
|
|
519
|
my $char = chr($q + $self->{qual_offset}); |
235
|
680
|
|
|
|
|
683
|
$self->{chr2qual}->{$char} = $q; |
236
|
680
|
|
|
|
|
745
|
$self->{qual2chr}->{$q} = $char; |
237
|
680
|
|
|
|
|
833
|
my $s2p = 10 * log(1 + 10 ** ($q / 10.0)) / log(10); |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
# solexa <=> solexa mapping speedup (retain floating pt precision) |
240
|
680
|
|
|
|
|
1908
|
$self->{phred_fp2chr}->{$s2p} = $char; |
241
|
680
|
|
|
|
|
620
|
$self->{sol2phred}->{$q} = $s2p; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
# this is for mapping values fuzzily (fallback) |
244
|
680
|
|
|
|
|
2331
|
$self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$q - 0.5, $q + 0.5)} = $char; |
245
|
|
|
|
|
|
|
|
246
|
680
|
100
|
|
|
|
930
|
next if $q < 0; # skip loop; PHRED scores greater than 0 |
247
|
630
|
100
|
|
|
|
1202
|
my $p2s = sprintf("%.0f",($q <= 1) ? -5 : 10 * log(-1 + 10 ** ($q / 10.0)) / log(10)); |
248
|
|
|
|
|
|
|
# sanger/illumina PHRED <=> Solexa char mapping speedup |
249
|
630
|
|
|
|
|
1001
|
$self->{phred_int2chr}->{$q} = chr($p2s + $self->{qual_offset}); |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
} else { |
252
|
49
|
|
|
|
|
123
|
for my $c ($self->{qual_start}..$self->{qual_end}) { |
253
|
|
|
|
|
|
|
# PHRED mapping |
254
|
4296
|
|
|
|
|
3530
|
my $char = chr($c + $self->{qual_offset}); |
255
|
4296
|
|
|
|
|
3987
|
$self->{chr2qual}->{$char} = $c; |
256
|
4296
|
|
|
|
|
4247
|
$self->{qual2chr}->{$c} = $char; |
257
|
|
|
|
|
|
|
# this is for mapping values not found with above |
258
|
4296
|
|
|
|
|
14595
|
$self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$c - 0.5, $c + 0.5)} = $char; |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
sub validate { |
264
|
59
|
|
|
59
|
1
|
69
|
my ($self, $val) = @_; |
265
|
59
|
50
|
|
|
|
100
|
if (defined $val) { |
266
|
59
|
|
|
|
|
87
|
$self->{_validate_qual} = $val; |
267
|
|
|
|
|
|
|
} |
268
|
59
|
|
|
|
|
53
|
return $self->{_validate_qual}; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
sub quality_header{ |
272
|
0
|
|
|
0
|
1
|
|
my ($self, $val) = @_; |
273
|
0
|
0
|
|
|
|
|
if (defined $val) { |
274
|
0
|
|
|
|
|
|
$self->{_quality_header} = $val; |
275
|
|
|
|
|
|
|
} |
276
|
0
|
|
0
|
|
|
|
return $self->{_quality_header} || 0; |
277
|
|
|
|
|
|
|
} |
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
1; |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
__END__ |