File Coverage

Bio/SeqIO/fastq.pm
Criterion Covered Total %
statement 125 144 86.8
branch 52 70 74.2
condition 33 39 84.6
subroutine 9 13 69.2
pod 8 8 100.0
total 227 274 82.8


line stmt bran cond sub pod time code
1             # POD at __END__, let the code begin...
2              
3             package Bio::SeqIO::fastq;
4 4     4   378 use strict;
  4         6  
  4         127  
5              
6 4     4   637 use Bio::Seq::SeqFactory;
  4         7  
  4         91  
7              
8 4     4   12 use base qw(Bio::SeqIO);
  4         4  
  4         4540  
9              
10             our %variant = (
11             sanger => {
12             'offset' => 33,
13             'qual_start' => 0,
14             'qual_end' => 93
15             },
16             solexa => {
17             'offset' => 64,
18             'qual_start' => -5,
19             'qual_end' => 62
20             },
21             illumina => {
22             'offset' => 64,
23             'qual_start' => 0,
24             'qual_end' => 62
25             },
26             );
27              
28             sub _initialize {
29 59     59   100 my($self,@args) = @_;
30 59         108 $self->SUPER::_initialize(@args);
31 59         173 my ($variant, $validate, $header) = $self->_rearrange([qw(VARIANT
32             VALIDATE
33             QUALITY_HEADER)], @args);
34 59   100     136 $variant ||= 'sanger';
35 59         109 $self->variant($variant);
36 59         132 $self->_init_tables($variant);
37 59 50       142 $validate = defined $validate ? $validate : 1;
38 59         116 $self->validate($validate);
39 59 50       93 $header && $self->quality_header($header);
40              
41 59 50       148 if( ! defined $self->sequence_factory ) {
42 59         152 $self->sequence_factory(Bio::Seq::SeqFactory->new(
43             -verbose => $self->verbose(),
44             -type => 'Bio::Seq::Quality')
45             );
46             }
47             }
48              
49             sub next_seq {
50 393     393 1 1037 my( $self ) = @_;
51 393         567 while (defined(my $data = $self->next_dataset)) {
52             # Are FASTQ sequences w/o any sequence valid? Removing for now
53             # -cjfields 6.22.09
54 359         798 my $seq = $self->sequence_factory->create(%$data);
55 359         1037 return $seq;
56             }
57 14         19 return;
58             }
59              
60             # pure perl version
61             sub next_dataset {
62 402     402 1 340 my $self = shift;
63 402         1266 local $/ = "\n";
64 402         316 my $data;
65 402         354 my $mode = '-seq';
66             # speed this up by directly accessing the filehandle and in-lining the
67             # _readline stuff vs. making the repeated method calls. Tradeoff is speed
68             # over repeated code.
69              
70             # we can probably normalize line endings using PerlIO::eol or
71             # Encode::Newlines
72              
73 402         749 my $fh = $self->_fh;
74 402   100     1367 my $line = $self->{lastline} || <$fh>;
75              
76             FASTQ:
77 402         591 while (defined $line) {
78 1908         1971 $line =~ s/\015\012/\012/;
79 1908         1743 $line =~ tr/\015/\n/;
80 1908 100 100     8660 if ($mode eq '-seq' && $line =~ m{^@([^\n]+)$}xmso) {
    100 100        
81 388         1245 $data->{-descriptor} = $1;
82 388         277 my ($id,$fulldesc);
83 388 50       1131 if ($data->{-descriptor} =~ /^\s*(\S+)\s*(.*)/) {
84 388         724 ($id,$fulldesc) = ($1, $2);
85             } else {
86 0         0 $self->throw("Can't parse fastq header");
87             }
88 388         617 $data->{-id} = $id;
89 388         522 $data->{-desc} = $fulldesc;
90 388         710 $data->{-namespace} = $self->variant;
91             } elsif ($mode eq '-seq' && $line =~ m{^\+([^\n]*)}xmso) {
92 384         547 my $desc = $1;
93 384 50       707 $self->throw("No description line parsed") unless $data->{-descriptor};
94 384 100 100     1180 if ($desc && $data->{-descriptor} ne $desc) {
95             $self->throw("Quality descriptor [$desc] doesn't match seq ".
96 2         21 "descriptor ".$data->{-descriptor}.", line: $." );
97             }
98 382         434 $mode = '-raw_quality';
99             } else {
100 1136 100 100     3966 if ($mode eq '-raw_quality' && defined($data->{-raw_quality}) &&
      100        
101             (length($data->{-raw_quality}) >= length($data->{-seq}))) {
102 352         415 $self->{lastline} = $line;
103             last FASTQ
104 352         514 }
105 784         699 chomp $line;
106 784 100       1478 if ($line =~ /^$/) {
107 3         4 delete $self->{lastline};
108 3         5 last FASTQ;
109             }
110 781         1638 $data->{$mode} .= $line
111             }
112 1551         4719 $line = <$fh>;
113 1551 100       3006 if (!defined $line) {
114 31         36 delete $self->{lastline};
115 31         42 last FASTQ;
116             }
117             }
118              
119 400 100       655 return unless $data;
120 386 100 66     1051 if (!$data->{-seq} || !defined($data->{-raw_quality})) {
121 6         21 $self->throw("Missing sequence and/or quality data; line: $.");
122             }
123              
124             # simple quality control tests
125 380 100       756 if (length $data->{-seq} != length $data->{-raw_quality}) {
126             $self->throw("Quality string [".$data->{-raw_quality}."] of length [".
127             length($data->{-raw_quality})."]\ndoesn't match length of sequence ".
128 3         20 $data->{-seq}."\n[".length($data->{-seq})."], line: $.");
129             }
130              
131             $data->{-qual} = [map {
132 69676 100 66     177017 if ($self->{_validate_qual} && !exists($self->{chr2qual}->{$_})) {
133 9         31 $self->throw("Unknown symbol with ASCII value ".ord($_)." outside ".
134             "of quality range")
135             # TODO: fallback?
136             }
137             $self->variant eq 'solexa' ?
138             $self->{sol2phred}->{$self->{chr2qual}->{$_}}:
139 69667 100       75230 $self->{chr2qual}->{$_};
140 377         9643 } unpack("A1" x length($data->{-raw_quality}), $data->{-raw_quality})];
141 368         5553 return $data;
142             }
143              
144             # This should be creating fastq output only. Bio::SeqIO::fasta and
145             # Bio::SeqIO::qual should be used for that output
146              
147             sub write_seq {
148 9     9 1 10 my ($self,@seq) = @_;
149 9         17 my $var = $self->variant;
150 9         13 foreach my $seq (@seq) {
151 9 50       23 unless ($seq->isa("Bio::Seq::Quality")){
152 0         0 $self->warn("You can't write FASTQ without supplying a Bio::Seq::".
153             "Quality object! ".ref($seq)."\n");
154 0         0 next;
155             }
156 9   50     19 my $str = $seq->seq || '';
157 9         8 my @qual = @{$seq->qual};
  9         20  
158              
159             # this should be the origin of the sequence (illumina, solexa, sanger)
160 9         15 my $ns= $seq->namespace;
161              
162 9         16 my $top = $seq->display_id();
163 9 100       11 if (my $desc = $seq->desc()) {
164 6         13 $desc =~ s/\n//g;
165 6         10 $top .= " $desc";
166             }
167 9         9 my $qual = '';
168             my $qual_map =
169             ($ns eq 'solexa' && $var eq 'solexa') ? $self->{phred_fp2chr} :
170             ($var eq 'solexa') ? $self->{phred_int2chr} :
171 9 100 100     36 $self->{qual2chr};
    100          
172              
173 9         7 my %bad_qual;
174 9         11 for my $q (@qual) {
175 543 100 100     1034 $q = sprintf("%.0f", $q) if ($var ne 'solexa' && $ns eq 'solexa');
176 543 100       598 if (exists $qual_map->{$q}) {
177 481         351 $qual .= $qual_map->{$q};
178 481         303 next;
179             } else {
180             # fuzzy mapping, for edited qual scores
181 62         53 my $qr = sprintf("%.0f",$q);
182 62         183 my $bounds = sprintf("%.1f-%.1f",$qr-0.5, $qr+0.5);
183 62 50       74 if (exists $self->{fuzzy_qual2chr}->{$bounds}) {
184 0         0 $qual .= $self->{fuzzy_qual2chr}->{$bounds};
185 0         0 next;
186             } else {
187             my $rep = ($q <= $self->{qual_start}) ?
188 62 50       79 $qual_map->{$self->{qual_start}} : $qual_map->{$self->{qual_end}};
189 62         40 $qual .= $rep;
190 62         74 $bad_qual{$q}++;
191             }
192             }
193             }
194 9 100 66     38 if ($self->{_validate_qual} && %bad_qual) {
195             $self->warn("Data loss for $var: following values not found\n".
196 2         14 join(',',sort {$a <=> $b} keys %bad_qual))
  234         153  
197             }
198 9 50       25 $self->_print("\@",$top,"\n",$str,"\n") or return;
199 9 50       25 $self->_print("+",($self->{_quality_header} ? $top : ''),"\n",$qual,"\n") or return;
    50          
200             }
201 9         12 return 1;
202             }
203              
204             sub write_fastq {
205 0     0 1 0 my ($self,@seq) = @_;
206 0         0 return $self->write_seq(@seq);
207             }
208              
209             sub write_fasta {
210 0     0 1 0 my ($self,@seq) = @_;
211 0 0       0 if (!exists($self->{fasta_proxy})) {
212 0         0 $self->{fasta_proxy} = Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh);
213             }
214 0         0 return $self->{fasta_proxy}->write_seq(@seq);
215             }
216              
217             sub write_qual {
218 0     0 1 0 my ($self,@seq) = @_;
219 0 0       0 if (!exists($self->{qual_proxy})) {
220 0         0 $self->{qual_proxy} = Bio::SeqIO->new(-format => 'qual', -fh => $self->_fh);
221             }
222 0         0 return $self->{qual_proxy}->write_seq(@seq);
223             }
224              
225             # variant() method inherited from Bio::Root::IO
226              
227             sub _init_tables {
228 59     59   59 my ($self, $var) = @_;
229             # cache encode/decode values for quicker accession
230             ($self->{qual_start}, $self->{qual_end}, $self->{qual_offset}) =
231 59         55 @{ $variant{$var} }{qw(qual_start qual_end offset)};
  59         158  
232 59 100       113 if ($var eq 'solexa') {
233 10         28 for my $q ($self->{qual_start} .. $self->{qual_end}) {
234 680         516 my $char = chr($q + $self->{qual_offset});
235 680         639 $self->{chr2qual}->{$char} = $q;
236 680         685 $self->{qual2chr}->{$q} = $char;
237 680         1139 my $s2p = 10 * log(1 + 10 ** ($q / 10.0)) / log(10);
238              
239             # solexa <=> solexa mapping speedup (retain floating pt precision)
240 680         1856 $self->{phred_fp2chr}->{$s2p} = $char;
241 680         596 $self->{sol2phred}->{$q} = $s2p;
242              
243             # this is for mapping values fuzzily (fallback)
244 680         2217 $self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$q - 0.5, $q + 0.5)} = $char;
245              
246 680 100       886 next if $q < 0; # skip loop; PHRED scores greater than 0
247 630 100       1253 my $p2s = sprintf("%.0f",($q <= 1) ? -5 : 10 * log(-1 + 10 ** ($q / 10.0)) / log(10));
248             # sanger/illumina PHRED <=> Solexa char mapping speedup
249 630         994 $self->{phred_int2chr}->{$q} = chr($p2s + $self->{qual_offset});
250             }
251             } else {
252 49         117 for my $c ($self->{qual_start}..$self->{qual_end}) {
253             # PHRED mapping
254 4296         3632 my $char = chr($c + $self->{qual_offset});
255 4296         3838 $self->{chr2qual}->{$char} = $c;
256 4296         4212 $self->{qual2chr}->{$c} = $char;
257             # this is for mapping values not found with above
258 4296         14054 $self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$c - 0.5, $c + 0.5)} = $char;
259             }
260             }
261             }
262              
263             sub validate {
264 59     59 1 62 my ($self, $val) = @_;
265 59 50       108 if (defined $val) {
266 59         71 $self->{_validate_qual} = $val;
267             }
268 59         53 return $self->{_validate_qual};
269             }
270              
271             sub quality_header{
272 0     0 1   my ($self, $val) = @_;
273 0 0         if (defined $val) {
274 0           $self->{_quality_header} = $val;
275             }
276 0   0       return $self->{_quality_header} || 0;
277             }
278              
279             1;
280              
281             __END__