File Coverage

blib/lib/Bio/Phylo/Parsers/Fastq.pm

Criterion	Covered	Total	%
statement	60	60	100.0
branch	14	22	63.6
condition	9	12	75.0
subroutine	7	7	100.0
pod			n/a
total	90	101	89.1

line	stmt	bran	cond	sub	time	code
1						package Bio::Phylo::Parsers::Fastq;
2	1			1	6	use strict;
	1				1
	1				25
3	1			1	4	use base 'Bio::Phylo::Parsers::Abstract';
	1				1
	1				266
4	1			1	6	use Bio::Phylo::Util::Logger ':simple';
	1				2
	1				91
5	1			1	6	use Bio::Phylo::Util::Exceptions 'throw';
	1				2
	1				32
6	1			1	5	use Bio::Phylo::Util::CONSTANT ':objecttypes';
	1				2
	1				523
7
8						=head1 NAME
9
10						Bio::Phylo::Parsers::Fastq - Parser used by Bio::Phylo::IO, no serviceable parts inside
11
12						=head1 DESCRIPTION
13
14						A FASTQ file parser. To use it, you need to pass an argument
15						that specifies the data type of the phred scores into the parse function, i.e.
16
17						my $handler_type = _DATUM_;
18						parse(
19						-format => 'fastq',
20						-type => 'illumina', # to indicate how phred scores are scaled
21						-file => 'infile.fastq',
22						-flush => 1, # don't store record, flush and move on
23						-handlers => {
24
25						# specifies a handler that is executed on each newly created datum
26						$handler_type => sub {
27						my $seq = shift;
28						my @char = $seq->get_char;
29						my @anno = @{ $seq->get_annotations };
30
31						# print fasta, omit bases with low phred scores
32						print ">$seq\n";
33						for my $i ( 0 .. $#char ) {
34						if ( $anno[$i]->{phred} > 20 ) {
35						print $char[$i];
36						}
37						}
38						print "\n";
39						}
40						}
41						);
42
43						=cut
44
45						sub _parse {
46	1			1	2	my $self = shift;
47	1				3	my $fh = $self->_handle;
48	1				4	my $fac = $self->_factory;
49	1	50			3	my $type = $self->_args->{'-type'} or throw 'BadArgs' => 'No data type specified!';
50	1				6	my $to = $fac->create_datatype($type);
51	1				7	my $matrix;
52	1	50			12	$matrix = $fac->create_matrix( '-type' => 'dna' ) unless $self->_flush;
53
54	1				5	my ( $readseq, $readphred );
55	1				0	my ( $id, $seq, $phred );
56	1				42	LINE: while( my $line = $fh->getline ) {
57	2336				64487	chomp $line;
58
59						# found the FASTQ id line
60	2336	100	100		15173	if ( $line =~ /^\@(.+)$/ and not $readphred ) {
		100	66
		100
		50
61	584				1924	my $capture = $1;
62
63						# process previous record
64	584	50	66		3140	if ( $id && $seq && $phred ) {
			66
65	583				2212	$self->_process_seq(
66						'phred' => $phred,
67						'seq' => $seq,
68						'id' => $id,
69						'to' => $to,
70						);
71						}
72
73						# start new record
74	584				1936	$id = $capture;
75	584				1373	$readseq = 1;
76	584				1014	$readphred = 0;
77	584				1374	$seq = '';
78	584				3695	INFO "found record ID $id, going to read sequence";
79	584				24410	next LINE;
80						}
81
82						# found the FASTQ plus line
83						elsif ( $line =~ /^\+/ and not $readphred ) {
84	584				1248	$readseq = 0;
85	584				955	$readphred = 1;
86	584				1298	$phred = '';
87	584				1985	INFO "found plus line, going to read sequence quality";
88	584				11444	next LINE;
89						}
90
91						# concatenate sequence
92						elsif ( $readseq ) {
93	584				1812	$seq .= $line;
94	584				11376	next LINE;
95						}
96
97						# concatenate quality line
98						elsif ( $readphred ) {
99	584				1294	$phred .= $line;
100	584	50			3461	if ( length($phred) == length($seq) ) {
101	584				2428	INFO "found all phred characters";
102	584				1408	$readphred = 0;
103						}
104	584				11726	next LINE;
105						}
106						}
107
108						# process last record
109						$self->_process_seq(
110	1				31	'phred' => $phred,
111						'seq' => $seq,
112						'id' => $id,
113						'to' => $to,
114						);
115
116						# done
117	1	50			4	return $self->_flush ? undef : $matrix;
118						}
119
120						sub _process_seq {
121	584			584	7687	my ($self,%args) = @_;
122	584				2469	my $sh = $self->_handlers(_DATUM_);
123
124						# turn the phred line into column-level annotations
125	82243				168560	my @scores = map { { 'phred' => $_ } }
126	82243				119139	map { @{ $args{to}->get_states_for_symbol($_) } }
	82243				191575
127	584				1766	@{ $args{to}->split($args{phred}) };
	584				3196
128
129						# create the sequence object
130						my $datum = $self->_factory->create_datum(
131						'-type' => 'dna',
132						'-name' => $args{id},
133						'-char' => $args{seq},
134	584				14066	'-annotations' => \@scores,
135						);
136
137	584	50			4430	$sh->($datum) if $sh;
138	584	50			1208506	$args{'matrix'}->insert($datum) unless $self->_flush;
139						}
140
141						# podinherit_insert_token
142
143						=head1 SEE ALSO
144
145						There is a mailing list at L<https://groups.google.com/forum/#!forum/bio-phylo>
146						for any user or developer questions and discussions.
147
148						=over
149
150						=item L<Bio::Phylo::IO>
151
152						The fasta parser is called by the L<Bio::Phylo::IO\|Bio::Phylo::IO> object.
153						Look there to learn more about parsing.
154
155						=item L<Bio::Phylo::Manual>
156
157						Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>
158
159						=back
160
161						=head1 CITATION
162
163						If you use Bio::Phylo in published research, please cite it:
164
165						B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
166						and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
167						I<BMC Bioinformatics> B<12>:63.
168						L<http://dx.doi.org/10.1186/1471-2105-12-63>
169
170						=cut
171
172						1;