line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Phylo::Parsers::Fastq; |
2
|
1
|
|
|
1
|
|
5
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
24
|
|
3
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
21
|
|
4
|
1
|
|
|
1
|
|
4
|
use base 'Bio::Phylo::Parsers::Abstract'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
266
|
|
5
|
1
|
|
|
1
|
|
6
|
use Bio::Phylo::Util::Logger ':simple'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
91
|
|
6
|
1
|
|
|
1
|
|
6
|
use Bio::Phylo::Util::Exceptions 'throw'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
7
|
1
|
|
|
1
|
|
4
|
use Bio::Phylo::Util::CONSTANT ':objecttypes'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
561
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 NAME |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
Bio::Phylo::Parsers::Fastq - Parser used by Bio::Phylo::IO, no serviceable parts inside |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 DESCRIPTION |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
A FASTQ file parser. To use it, you need to pass an argument |
16
|
|
|
|
|
|
|
that specifies the data type of the phred scores into the parse function, i.e. |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $handler_type = _DATUM_; |
19
|
|
|
|
|
|
|
parse( |
20
|
|
|
|
|
|
|
-format => 'fastq', |
21
|
|
|
|
|
|
|
-type => 'illumina', # to indicate how phred scores are scaled |
22
|
|
|
|
|
|
|
-file => 'infile.fastq', |
23
|
|
|
|
|
|
|
-flush => 1, # don't store record, flush and move on |
24
|
|
|
|
|
|
|
-handlers => { |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# specifies a handler that is executed on each newly created datum |
27
|
|
|
|
|
|
|
$handler_type => sub { |
28
|
|
|
|
|
|
|
my $seq = shift; |
29
|
|
|
|
|
|
|
my @char = $seq->get_char; |
30
|
|
|
|
|
|
|
my @anno = @{ $seq->get_annotations }; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# print fasta, omit bases with low phred scores |
33
|
|
|
|
|
|
|
print ">$seq\n"; |
34
|
|
|
|
|
|
|
for my $i ( 0 .. $#char ) { |
35
|
|
|
|
|
|
|
if ( $anno[$i]->{phred} > 20 ) { |
36
|
|
|
|
|
|
|
print $char[$i]; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
print "\n"; |
40
|
|
|
|
|
|
|
} |
41
|
|
|
|
|
|
|
} |
42
|
|
|
|
|
|
|
); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=cut |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Reads FASTQ records line by line from the parser's file handle and hands
# every complete record (id, sequence, quality string) to _process_seq.
#
# Requires the '-type' argument, which names the data type used to decode
# the quality characters; throws 'BadArgs' when it is missing.
#
# Returns undef when the parser is flushing records, otherwise the DNA
# matrix that is created here and passed along to _process_seq.
#
# NOTE(review): _handle, _factory, _args and _flush are not defined in this
# file; presumably they come from Bio::Phylo::Parsers::Abstract - confirm.
sub _parse {
    my $self = shift;
    my $fh   = $self->_handle;
    my $fac  = $self->_factory;
    my $type = $self->_args->{'-type'}
      or throw 'BadArgs' => 'No data type specified!';
    my $to = $fac->create_datatype($type);

    # the matrix only exists when records are kept rather than flushed
    my $matrix;
    $matrix = $fac->create_matrix( '-type' => 'dna' ) unless $self->_flush;

    my ( $readseq, $readphred );
    my ( $id, $seq, $phred );

    # Hands the record collected so far to _process_seq. Incomplete records
    # (no id yet, empty sequence or empty quality string) are skipped, which
    # also covers empty input. The test uses defined/length rather than
    # truthiness so that an id of "0" is not silently dropped.
    my $emit = sub {
        return unless defined $id and length($seq) and length($phred);
        $self->_process_seq(
            'phred'  => $phred,
            'seq'    => $seq,
            'id'     => $id,
            'to'     => $to,

            # _process_seq inserts into this matrix when not flushing
            'matrix' => $matrix,
        );
    };

    LINE: while ( my $line = $fh->getline ) {
        chomp $line;

        # found the FASTQ id line; a quality line may also start with '@',
        # hence the check that we are not in the middle of reading qualities
        if ( $line =~ /^\@(.+)$/ and not $readphred ) {

            # copy the capture right away, before any other code runs
            my $capture = $1;

            # process previous record
            $emit->();

            # start new record; the quality string is cleared too so that a
            # record without a plus line cannot reuse the previous one
            $id        = $capture;
            $readseq   = 1;
            $readphred = 0;
            $seq       = '';
            $phred     = '';
            INFO "found record ID $id, going to read sequence";
            next LINE;
        }

        # found the FASTQ plus line
        elsif ( $line =~ /^\+/ and not $readphred ) {
            $readseq   = 0;
            $readphred = 1;
            $phred     = '';
            INFO "found plus line, going to read sequence quality";
            next LINE;
        }

        # concatenate sequence
        elsif ( $readseq ) {
            $seq .= $line;
            next LINE;
        }

        # concatenate quality line; reading stops once there is exactly one
        # quality character per sequence character
        elsif ( $readphred ) {
            $phred .= $line;
            if ( length($phred) == length($seq) ) {
                INFO "found all phred characters";
                $readphred = 0;
            }
            next LINE;
        }
    }

    # process last record, if there is a complete one
    $emit->();

    # done
    return $self->_flush ? undef : $matrix;
}
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# Builds one DNA datum from a parsed FASTQ record, runs the registered
# _DATUM_ handler on it (if any) and, unless the parser is flushing,
# inserts it into the matrix.
#
# Named arguments:
#   phred  - the raw quality string of the record
#   seq    - the sequence string
#   id     - the record identifier, used as the datum name
#   to     - data type object used to split and decode the quality string
#   matrix - object the datum is inserted into when not flushing
sub _process_seq {
    my ( $self, %args ) = @_;
    my $handler  = $self->_handlers(_DATUM_);
    my $datatype = $args{to};

    # turn the phred line into column-level annotations: every symbol of
    # the quality string yields one annotation per state it maps to
    my @scores;
    for my $symbol ( @{ $datatype->split( $args{phred} ) } ) {
        for my $state ( @{ $datatype->get_states_for_symbol($symbol) } ) {
            push @scores, { 'phred' => $state };
        }
    }

    # create the sequence object
    my %datum_args = (
        '-type'        => 'dna',
        '-name'        => $args{id},
        '-char'        => $args{seq},
        '-annotations' => \@scores,
    );
    my $datum = $self->_factory->create_datum(%datum_args);

    # the handler sees the datum before it is stored
    $handler->($datum) if $handler;
    $args{'matrix'}->insert($datum) unless $self->_flush;
}
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# podinherit_insert_token |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head1 SEE ALSO |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo> |
147
|
|
|
|
|
|
|
for any user or developer questions and discussions. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
=over |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=item L<Bio::Phylo::IO> |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
The fastq parser is called by the L<Bio::Phylo::IO|Bio::Phylo::IO> object. |
154
|
|
|
|
|
|
|
Look there to learn more about parsing. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=item L<Bio::Phylo::Manual> |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com> |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=back |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=head1 CITATION |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
If you use Bio::Phylo in published research, please cite it: |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen> |
167
|
|
|
|
|
|
|
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl. |
168
|
|
|
|
|
|
|
I<BMC Bioinformatics> B<12>:63. |
169
|
|
|
|
|
|
|
L<http://dx.doi.org/10.1186/1471-2105-12-63> |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
1; |