File Coverage

blib/lib/Bio/Phylo/Parsers/Fastq.pm

Criterion	Covered	Total	%
statement	63	63	100.0
branch	14	22	63.6
condition	9	12	75.0
subroutine	8	8	100.0
pod			n/a
total	94	105	89.5

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Fastq;
2	1			1	5	use strict;
	1				2
	1				24
3	1			1	4	use warnings;
	1				1
	1				21
4	1			1	4	use base 'Bio::Phylo::Parsers::Abstract';
	1				2
	1				266
5	1			1	6	use Bio::Phylo::Util::Logger ':simple';
	1				2
	1				91
6	1			1	6	use Bio::Phylo::Util::Exceptions 'throw';
	1				2
	1				33
7	1			1	4	use Bio::Phylo::Util::CONSTANT ':objecttypes';
	1				1
	1				561
8
9						=head1 NAME
10
11						Bio::Phylo::Parsers::Fastq - Parser used by Bio::Phylo::IO, no serviceable parts inside
12
13						=head1 DESCRIPTION
14
15						A FASTQ file parser. To use it, you need to pass an argument
16						that specifies the data type of the phred scores into the parse function, i.e.
17
18						my $handler_type = _DATUM_;
19						parse(
20						-format => 'fastq',
21						-type => 'illumina', # to indicate how phred scores are scaled
22						-file => 'infile.fastq',
23						-flush => 1, # don't store record, flush and move on
24						-handlers => {
25
26						# specifies a handler that is executed on each newly created datum
27						$handler_type => sub {
28						my $seq = shift;
29						my @char = $seq->get_char;
30						my @anno = @{ $seq->get_annotations };
31
32						# print fasta, omit bases with low phred scores
33						print ">$seq\n";
34						for my $i ( 0 .. $#char ) {
35						if ( $anno[$i]->{phred} > 20 ) {
36						print $char[$i];
37						}
38						}
39						print "\n";
40						}
41						}
42						);
43
44						=cut
45
46						sub _parse {
47	1			1	1	my $self = shift;
48	1				4	my $fh = $self->_handle;
49	1				3	my $fac = $self->_factory;
50	1	50			3	my $type = $self->_args->{'-type'} or throw 'BadArgs' => 'No data type specified!';
51	1				6	my $to = $fac->create_datatype($type);
52	1				2	my $matrix;
53	1	50			8	$matrix = $fac->create_matrix( '-type' => 'dna' ) unless $self->_flush;
54
55	1				3	my ( $readseq, $readphred );
56	1				0	my ( $id, $seq, $phred );
57	1				29	LINE: while( my $line = $fh->getline ) {
58	2336				48160	chomp $line;
59
60						# found the FASTQ id line
61	2336	100	100		10522	if ( $line =~ /^\@(.+)$/ and not $readphred ) {
		100	66
		100
		50
62	584				1194	my $capture = $1;
63
64						# process previous record
65	584	50	66		2356	if ( $id && $seq && $phred ) {
			66
66	583				1345	$self->_process_seq(
67						'phred' => $phred,
68						'seq' => $seq,
69						'id' => $id,
70						'to' => $to,
71						);
72						}
73
74						# start new record
75	584				1254	$id = $capture;
76	584				855	$readseq = 1;
77	584				798	$readphred = 0;
78	584				970	$seq = '';
79	584				2642	INFO "found record ID $id, going to read sequence";
80	584				15427	next LINE;
81						}
82
83						# found the FASTQ plus line
84						elsif ( $line =~ /^\+/ and not $readphred ) {
85	584				997	$readseq = 0;
86	584				832	$readphred = 1;
87	584				902	$phred = '';
88	584				1416	INFO "found plus line, going to read sequence quality";
89	584				9277	next LINE;
90						}
91
92						# concatenate sequence
93						elsif ( $readseq ) {
94	584				1082	$seq .= $line;
95	584				8850	next LINE;
96						}
97
98						# concatenate quality line
99						elsif ( $readphred ) {
100	584				1193	$phred .= $line;
101	584	50			2197	if ( length($phred) == length($seq) ) {
102	584				1373	INFO "found all phred characters";
103	584				796	$readphred = 0;
104						}
105	584				9212	next LINE;
106						}
107						}
108
109						# process last record
110						$self->_process_seq(
111	1				32	'phred' => $phred,
112						'seq' => $seq,
113						'id' => $id,
114						'to' => $to,
115						);
116
117						# done
118	1	50			5	return $self->_flush ? undef : $matrix;
119						}
120
121						sub _process_seq {
122	584			584	2591	my ($self,%args) = @_;
123	584				1582	my $sh = $self->_handlers(_DATUM_);
124
125						# turn the phred line into column-level annotations
126	82243				114768	my @scores = map { { 'phred' => $_ } }
127	82243				93339	map { @{ $args{to}->get_states_for_symbol($_) } }
	82243				137500
128	584				920	@{ $args{to}->split($args{phred}) };
	584				1671
129
130						# create the sequence object
131						my $datum = $self->_factory->create_datum(
132						'-type' => 'dna',
133						'-name' => $args{id},
134						'-char' => $args{seq},
135	584				7382	'-annotations' => \@scores,
136						);
137
138	584	50			2602	$sh->($datum) if $sh;
139	584	50			513693	$args{'matrix'}->insert($datum) unless $self->_flush;
140						}
141
142						# podinherit_insert_token
143
144						=head1 SEE ALSO
145
146						There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
147						for any user or developer questions and discussions.
148
149						=over
150
151						=item L<Bio::Phylo::IO>
152
153						The fasta parser is called by the L<Bio::Phylo::IO\|Bio::Phylo::IO> object.
154						Look there to learn more about parsing.
155
156						=item L<Bio::Phylo::Manual>
157
158						Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>
159
160						=back
161
162						=head1 CITATION
163
164						If you use Bio::Phylo in published research, please cite it:
165
166						B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
167						and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
168						I<BMC Bioinformatics> B<12>:63.
169						L<http://dx.doi.org/10.1186/1471-2105-12-63>
170
171						=cut
172
173						1;