line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Phylo::Parsers::Fastq; |
2
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
25
|
|
3
|
1
|
|
|
1
|
|
4
|
use base 'Bio::Phylo::Parsers::Abstract'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
266
|
|
4
|
1
|
|
|
1
|
|
6
|
use Bio::Phylo::Util::Logger ':simple'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
91
|
|
5
|
1
|
|
|
1
|
|
6
|
use Bio::Phylo::Util::Exceptions 'throw'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
32
|
|
6
|
1
|
|
|
1
|
|
5
|
use Bio::Phylo::Util::CONSTANT ':objecttypes'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
523
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Bio::Phylo::Parsers::Fastq - Parser used by Bio::Phylo::IO, no serviceable parts inside |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 DESCRIPTION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
A FASTQ file parser. To use it, you need to pass an argument |
15
|
|
|
|
|
|
|
that specifies the data type of the phred scores into the parse function, i.e. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $handler_type = _DATUM_; |
18
|
|
|
|
|
|
|
parse( |
19
|
|
|
|
|
|
|
-format => 'fastq', |
20
|
|
|
|
|
|
|
-type => 'illumina', # to indicate how phred scores are scaled |
21
|
|
|
|
|
|
|
-file => 'infile.fastq', |
22
|
|
|
|
|
|
|
-flush => 1, # don't store record, flush and move on |
23
|
|
|
|
|
|
|
-handlers => { |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# specifies a handler that is executed on each newly created datum |
26
|
|
|
|
|
|
|
$handler_type => sub { |
27
|
|
|
|
|
|
|
my $seq = shift; |
28
|
|
|
|
|
|
|
my @char = $seq->get_char; |
29
|
|
|
|
|
|
|
my @anno = @{ $seq->get_annotations }; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# print fasta, omit bases with low phred scores |
32
|
|
|
|
|
|
|
print ">$seq\n"; |
33
|
|
|
|
|
|
|
for my $i ( 0 .. $#char ) { |
34
|
|
|
|
|
|
|
if ( $anno[$i]->{phred} > 20 ) { |
35
|
|
|
|
|
|
|
print $char[$i]; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
print "\n"; |
39
|
|
|
|
|
|
|
} |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
); |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub _parse { |
46
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
47
|
1
|
|
|
|
|
3
|
my $fh = $self->_handle; |
48
|
1
|
|
|
|
|
4
|
my $fac = $self->_factory; |
49
|
1
|
50
|
|
|
|
3
|
my $type = $self->_args->{'-type'} or throw 'BadArgs' => 'No data type specified!'; |
50
|
1
|
|
|
|
|
6
|
my $to = $fac->create_datatype($type); |
51
|
1
|
|
|
|
|
7
|
my $matrix; |
52
|
1
|
50
|
|
|
|
12
|
$matrix = $fac->create_matrix( '-type' => 'dna' ) unless $self->_flush; |
53
|
|
|
|
|
|
|
|
54
|
1
|
|
|
|
|
5
|
my ( $readseq, $readphred ); |
55
|
1
|
|
|
|
|
0
|
my ( $id, $seq, $phred ); |
56
|
1
|
|
|
|
|
42
|
LINE: while( my $line = $fh->getline ) { |
57
|
2336
|
|
|
|
|
64487
|
chomp $line; |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# found the FASTQ id line |
60
|
2336
|
100
|
100
|
|
|
15173
|
if ( $line =~ /^\@(.+)$/ and not $readphred ) { |
|
|
100
|
66
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
61
|
584
|
|
|
|
|
1924
|
my $capture = $1; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# process previous record |
64
|
584
|
50
|
66
|
|
|
3140
|
if ( $id && $seq && $phred ) { |
|
|
|
66
|
|
|
|
|
65
|
583
|
|
|
|
|
2212
|
$self->_process_seq( |
66
|
|
|
|
|
|
|
'phred' => $phred, |
67
|
|
|
|
|
|
|
'seq' => $seq, |
68
|
|
|
|
|
|
|
'id' => $id, |
69
|
|
|
|
|
|
|
'to' => $to, |
70
|
|
|
|
|
|
|
); |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# start new record |
74
|
584
|
|
|
|
|
1936
|
$id = $capture; |
75
|
584
|
|
|
|
|
1373
|
$readseq = 1; |
76
|
584
|
|
|
|
|
1014
|
$readphred = 0; |
77
|
584
|
|
|
|
|
1374
|
$seq = ''; |
78
|
584
|
|
|
|
|
3695
|
INFO "found record ID $id, going to read sequence"; |
79
|
584
|
|
|
|
|
24410
|
next LINE; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# found the FASTQ plus line |
83
|
|
|
|
|
|
|
elsif ( $line =~ /^\+/ and not $readphred ) { |
84
|
584
|
|
|
|
|
1248
|
$readseq = 0; |
85
|
584
|
|
|
|
|
955
|
$readphred = 1; |
86
|
584
|
|
|
|
|
1298
|
$phred = ''; |
87
|
584
|
|
|
|
|
1985
|
INFO "found plus line, going to read sequence quality"; |
88
|
584
|
|
|
|
|
11444
|
next LINE; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# concatenate sequence |
92
|
|
|
|
|
|
|
elsif ( $readseq ) { |
93
|
584
|
|
|
|
|
1812
|
$seq .= $line; |
94
|
584
|
|
|
|
|
11376
|
next LINE; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# concatenate quality line |
98
|
|
|
|
|
|
|
elsif ( $readphred ) { |
99
|
584
|
|
|
|
|
1294
|
$phred .= $line; |
100
|
584
|
50
|
|
|
|
3461
|
if ( length($phred) == length($seq) ) { |
101
|
584
|
|
|
|
|
2428
|
INFO "found all phred characters"; |
102
|
584
|
|
|
|
|
1408
|
$readphred = 0; |
103
|
|
|
|
|
|
|
} |
104
|
584
|
|
|
|
|
11726
|
next LINE; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
# process last record |
109
|
|
|
|
|
|
|
$self->_process_seq( |
110
|
1
|
|
|
|
|
31
|
'phred' => $phred, |
111
|
|
|
|
|
|
|
'seq' => $seq, |
112
|
|
|
|
|
|
|
'id' => $id, |
113
|
|
|
|
|
|
|
'to' => $to, |
114
|
|
|
|
|
|
|
); |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
# done |
117
|
1
|
50
|
|
|
|
4
|
return $self->_flush ? undef : $matrix; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
sub _process_seq { |
121
|
584
|
|
|
584
|
|
7687
|
my ($self,%args) = @_; |
122
|
584
|
|
|
|
|
2469
|
my $sh = $self->_handlers(_DATUM_); |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# turn the phred line into column-level annotations |
125
|
82243
|
|
|
|
|
168560
|
my @scores = map { { 'phred' => $_ } } |
126
|
82243
|
|
|
|
|
119139
|
map { @{ $args{to}->get_states_for_symbol($_) } } |
|
82243
|
|
|
|
|
191575
|
|
127
|
584
|
|
|
|
|
1766
|
@{ $args{to}->split($args{phred}) }; |
|
584
|
|
|
|
|
3196
|
|
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# create the sequence object |
130
|
|
|
|
|
|
|
my $datum = $self->_factory->create_datum( |
131
|
|
|
|
|
|
|
'-type' => 'dna', |
132
|
|
|
|
|
|
|
'-name' => $args{id}, |
133
|
|
|
|
|
|
|
'-char' => $args{seq}, |
134
|
584
|
|
|
|
|
14066
|
'-annotations' => \@scores, |
135
|
|
|
|
|
|
|
); |
136
|
|
|
|
|
|
|
|
137
|
584
|
50
|
|
|
|
4430
|
$sh->($datum) if $sh; |
138
|
584
|
50
|
|
|
|
1208506
|
$args{'matrix'}->insert($datum) unless $self->_flush; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# podinherit_insert_token |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
=head1 SEE ALSO |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo> |
146
|
|
|
|
|
|
|
for any user or developer questions and discussions. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
=over |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=item L<Bio::Phylo::IO> |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
The fasta parser is called by the L<Bio::Phylo::IO|Bio::Phylo::IO> object. |
153
|
|
|
|
|
|
|
Look there to learn more about parsing. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=item L<Bio::Phylo::Manual> |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com> |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=back |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=head1 CITATION |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
If you use Bio::Phylo in published research, please cite it: |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen> |
166
|
|
|
|
|
|
|
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl. |
167
|
|
|
|
|
|
|
I<BMC Bioinformatics> B<12>:63. |
168
|
|
|
|
|
|
|
L<http://dx.doi.org/10.1186/1471-2105-12-63> |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=cut |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
1; |