File Coverage

Bio/SeqIO/fastq.pm
Criterion Covered Total %
statement 125 144 86.8
branch 52 70 74.2
condition 33 39 84.6
subroutine 9 13 69.2
pod 8 8 100.0
total 227 274 82.8


line stmt bran cond sub pod time code
1             # POD at __END__, let the code begin...
2              
3             package Bio::SeqIO::fastq;
4 4     4   449 use strict;
  4         5  
  4         114  
5              
6 4     4   669 use Bio::Seq::SeqFactory;
  4         6  
  4         91  
7              
8 4     4   14 use base qw(Bio::SeqIO);
  4         7  
  4         5420  
9              
10             our %variant = (
11             sanger => {
12             'offset' => 33,
13             'qual_start' => 0,
14             'qual_end' => 93
15             },
16             solexa => {
17             'offset' => 64,
18             'qual_start' => -5,
19             'qual_end' => 62
20             },
21             illumina => {
22             'offset' => 64,
23             'qual_start' => 0,
24             'qual_end' => 62
25             },
26             );
27              
28             sub _initialize {
29 59     59   90 my($self,@args) = @_;
30 59         137 $self->SUPER::_initialize(@args);
31 59         180 my ($variant, $validate, $header) = $self->_rearrange([qw(VARIANT
32             VALIDATE
33             QUALITY_HEADER)], @args);
34 59   100     149 $variant ||= 'sanger';
35 59         109 $self->variant($variant);
36 59         140 $self->_init_tables($variant);
37 59 50       149 $validate = defined $validate ? $validate : 1;
38 59         119 $self->validate($validate);
39 59 50       100 $header && $self->quality_header($header);
40              
41 59 50       133 if( ! defined $self->sequence_factory ) {
42 59         139 $self->sequence_factory(Bio::Seq::SeqFactory->new(
43             -verbose => $self->verbose(),
44             -type => 'Bio::Seq::Quality')
45             );
46             }
47             }
48              
49             sub next_seq {
50 393     393 1 1069 my( $self ) = @_;
51 393         560 while (defined(my $data = $self->next_dataset)) {
52             # Are FASTQ sequences w/o any sequence valid? Removing for now
53             # -cjfields 6.22.09
54 359         815 my $seq = $self->sequence_factory->create(%$data);
55 359         926 return $seq;
56             }
57 14         23 return;
58             }
59              
60             # pure perl version
61             sub next_dataset {
62 402     402 1 316 my $self = shift;
63 402         1268 local $/ = "\n";
64 402         283 my $data;
65 402         353 my $mode = '-seq';
66             # speed this up by directly accessing the filehandle and in-lining the
67             # _readline stuff vs. making the repeated method calls. Tradeoff is speed
68             # over repeated code.
69              
70             # we can probably normalize line endings using PerlIO::eol or
71             # Encode::Newlines
72              
73 402         760 my $fh = $self->_fh;
74 402   100     1368 my $line = $self->{lastline} || <$fh>;
75              
76             FASTQ:
77 402         579 while (defined $line) {
78 1908         1905 $line =~ s/\015\012/\012/;
79 1908         1584 $line =~ tr/\015/\n/;
80 1908 100 100     8061 if ($mode eq '-seq' && $line =~ m{^@([^\n]+)$}xmso) {
    100 100        
81 388         1136 $data->{-descriptor} = $1;
82 388         265 my ($id,$fulldesc);
83 388 50       1142 if ($data->{-descriptor} =~ /^\s*(\S+)\s*(.*)/) {
84 388         688 ($id,$fulldesc) = ($1, $2);
85             } else {
86 0         0 $self->throw("Can't parse fastq header");
87             }
88 388         487 $data->{-id} = $id;
89 388         421 $data->{-desc} = $fulldesc;
90 388         624 $data->{-namespace} = $self->variant;
91             } elsif ($mode eq '-seq' && $line =~ m{^\+([^\n]*)}xmso) {
92 384         518 my $desc = $1;
93 384 50       639 $self->throw("No description line parsed") unless $data->{-descriptor};
94 384 100 100     1253 if ($desc && $data->{-descriptor} ne $desc) {
95             $self->throw("Quality descriptor [$desc] doesn't match seq ".
96 2         25 "descriptor ".$data->{-descriptor}.", line: $." );
97             }
98 382         434 $mode = '-raw_quality';
99             } else {
100 1136 100 100     3596 if ($mode eq '-raw_quality' && defined($data->{-raw_quality}) &&
      100        
101             (length($data->{-raw_quality}) >= length($data->{-seq}))) {
102 352         396 $self->{lastline} = $line;
103             last FASTQ
104 352         418 }
105 784         734 chomp $line;
106 784 100       1625 if ($line =~ /^$/) {
107 3         5 delete $self->{lastline};
108 3         4 last FASTQ;
109             }
110 781         1500 $data->{$mode} .= $line
111             }
112 1551         4250 $line = <$fh>;
113 1551 100       2796 if (!defined $line) {
114 31         38 delete $self->{lastline};
115 31         52 last FASTQ;
116             }
117             }
118              
119 400 100       608 return unless $data;
120 386 100 66     927 if (!$data->{-seq} || !defined($data->{-raw_quality})) {
121 6         23 $self->throw("Missing sequence and/or quality data; line: $.");
122             }
123              
124             # simple quality control tests
125 380 100       686 if (length $data->{-seq} != length $data->{-raw_quality}) {
126             $self->throw("Quality string [".$data->{-raw_quality}."] of length [".
127             length($data->{-raw_quality})."]\ndoesn't match length of sequence ".
128 3         25 $data->{-seq}."\n[".length($data->{-seq})."], line: $.");
129             }
130              
131             $data->{-qual} = [map {
132 69676 100 66     162496 if ($self->{_validate_qual} && !exists($self->{chr2qual}->{$_})) {
133 9         37 $self->throw("Unknown symbol with ASCII value ".ord($_)." outside ".
134             "of quality range")
135             # TODO: fallback?
136             }
137             $self->variant eq 'solexa' ?
138             $self->{sol2phred}->{$self->{chr2qual}->{$_}}:
139 69667 100       68474 $self->{chr2qual}->{$_};
140 377         8866 } unpack("A1" x length($data->{-raw_quality}), $data->{-raw_quality})];
141 368         5021 return $data;
142             }
143              
144             # This should be creating fastq output only. Bio::SeqIO::fasta and
145             # Bio::SeqIO::qual should be used for that output
146              
147             sub write_seq {
148 9     9 1 17 my ($self,@seq) = @_;
149 9         20 my $var = $self->variant;
150 9         18 foreach my $seq (@seq) {
151 9 50       34 unless ($seq->isa("Bio::Seq::Quality")){
152 0         0 $self->warn("You can't write FASTQ without supplying a Bio::Seq::".
153             "Quality object! ".ref($seq)."\n");
154 0         0 next;
155             }
156 9   50     26 my $str = $seq->seq || '';
157 9         11 my @qual = @{$seq->qual};
  9         24  
158              
159             # this should be the origin of the sequence (illumina, solexa, sanger)
160 9         25 my $ns= $seq->namespace;
161              
162 9         21 my $top = $seq->display_id();
163 9 100       25 if (my $desc = $seq->desc()) {
164 6         22 $desc =~ s/\n//g;
165 6         14 $top .= " $desc";
166             }
167 9         14 my $qual = '';
168             my $qual_map =
169             ($ns eq 'solexa' && $var eq 'solexa') ? $self->{phred_fp2chr} :
170             ($var eq 'solexa') ? $self->{phred_int2chr} :
171 9 100 100     46 $self->{qual2chr};
    100          
172              
173 9         10 my %bad_qual;
174 9         16 for my $q (@qual) {
175 543 100 100     1145 $q = sprintf("%.0f", $q) if ($var ne 'solexa' && $ns eq 'solexa');
176 543 100       645 if (exists $qual_map->{$q}) {
177 481         439 $qual .= $qual_map->{$q};
178 481         354 next;
179             } else {
180             # fuzzy mapping, for edited qual scores
181 62         62 my $qr = sprintf("%.0f",$q);
182 62         197 my $bounds = sprintf("%.1f-%.1f",$qr-0.5, $qr+0.5);
183 62 50       80 if (exists $self->{fuzzy_qual2chr}->{$bounds}) {
184 0         0 $qual .= $self->{fuzzy_qual2chr}->{$bounds};
185 0         0 next;
186             } else {
187             my $rep = ($q <= $self->{qual_start}) ?
188 62 50       69 $qual_map->{$self->{qual_start}} : $qual_map->{$self->{qual_end}};
189 62         42 $qual .= $rep;
190 62         84 $bad_qual{$q}++;
191             }
192             }
193             }
194 9 100 66     41 if ($self->{_validate_qual} && %bad_qual) {
195             $self->warn("Data loss for $var: following values not found\n".
196 2         24 join(',',sort {$a <=> $b} keys %bad_qual))
  232         155  
197             }
198 9 50       38 $self->_print("\@",$top,"\n",$str,"\n") or return;
199 9 50       37 $self->_print("+",($self->{_quality_header} ? $top : ''),"\n",$qual,"\n") or return;
    50          
200             }
201 9         16 return 1;
202             }
203              
204             sub write_fastq {
205 0     0 1 0 my ($self,@seq) = @_;
206 0         0 return $self->write_seq(@seq);
207             }
208              
209             sub write_fasta {
210 0     0 1 0 my ($self,@seq) = @_;
211 0 0       0 if (!exists($self->{fasta_proxy})) {
212 0         0 $self->{fasta_proxy} = Bio::SeqIO->new(-format => 'fasta', -fh => $self->_fh);
213             }
214 0         0 return $self->{fasta_proxy}->write_seq(@seq);
215             }
216              
217             sub write_qual {
218 0     0 1 0 my ($self,@seq) = @_;
219 0 0       0 if (!exists($self->{qual_proxy})) {
220 0         0 $self->{qual_proxy} = Bio::SeqIO->new(-format => 'qual', -fh => $self->_fh);
221             }
222 0         0 return $self->{qual_proxy}->write_seq(@seq);
223             }
224              
225             # variant() method inherited from Bio::Root::IO
226              
227             sub _init_tables {
228 59     59   68 my ($self, $var) = @_;
229             # cache encode/decode values for quicker accession
230             ($self->{qual_start}, $self->{qual_end}, $self->{qual_offset}) =
231 59         62 @{ $variant{$var} }{qw(qual_start qual_end offset)};
  59         181  
232 59 100       120 if ($var eq 'solexa') {
233 10         31 for my $q ($self->{qual_start} .. $self->{qual_end}) {
234 680         519 my $char = chr($q + $self->{qual_offset});
235 680         683 $self->{chr2qual}->{$char} = $q;
236 680         745 $self->{qual2chr}->{$q} = $char;
237 680         833 my $s2p = 10 * log(1 + 10 ** ($q / 10.0)) / log(10);
238              
239             # solexa <=> solexa mapping speedup (retain floating pt precision)
240 680         1908 $self->{phred_fp2chr}->{$s2p} = $char;
241 680         620 $self->{sol2phred}->{$q} = $s2p;
242              
243             # this is for mapping values fuzzily (fallback)
244 680         2331 $self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$q - 0.5, $q + 0.5)} = $char;
245              
246 680 100       930 next if $q < 0; # skip loop; PHRED scores greater than 0
247 630 100       1202 my $p2s = sprintf("%.0f",($q <= 1) ? -5 : 10 * log(-1 + 10 ** ($q / 10.0)) / log(10));
248             # sanger/illumina PHRED <=> Solexa char mapping speedup
249 630         1001 $self->{phred_int2chr}->{$q} = chr($p2s + $self->{qual_offset});
250             }
251             } else {
252 49         123 for my $c ($self->{qual_start}..$self->{qual_end}) {
253             # PHRED mapping
254 4296         3530 my $char = chr($c + $self->{qual_offset});
255 4296         3987 $self->{chr2qual}->{$char} = $c;
256 4296         4247 $self->{qual2chr}->{$c} = $char;
257             # this is for mapping values not found with above
258 4296         14595 $self->{fuzzy_qual2chr}->{sprintf("%.1f-%.1f",$c - 0.5, $c + 0.5)} = $char;
259             }
260             }
261             }
262              
263             sub validate {
264 59     59 1 69 my ($self, $val) = @_;
265 59 50       100 if (defined $val) {
266 59         87 $self->{_validate_qual} = $val;
267             }
268 59         53 return $self->{_validate_qual};
269             }
270              
271             sub quality_header{
272 0     0 1   my ($self, $val) = @_;
273 0 0         if (defined $val) {
274 0           $self->{_quality_header} = $val;
275             }
276 0   0       return $self->{_quality_header} || 0;
277             }
278              
279             1;
280              
281             __END__