File Coverage

Bio/SearchIO/gmap_f9.pm

Criterion	Covered	Total	%
statement	76	111	68.4
branch	15	64	23.4
condition	5	9	55.5
subroutine	8	10	80.0
pod	1	1	100.0
total	105	195	53.8

line	stmt	bran	cond	sub	pod	time	code
1							# $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $
2							#
3							# BioPerl module for Bio::SearchIO::gmap_f9
4							#
5							# Cared for by George Hartzell
6							#
7							# Copyright George Hartzell
8							#
9							# You may distribute this module under the same terms as perl itself
10
11							# POD documentation - main docs before the code
12
13							=head1 NAME
14
15							Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
16
17							=head1 SYNOPSIS
18
19							# Do not use this object directly - it is used as part of the
20							# Bio::SearchIO system.
21
22							use Bio::SearchIO;
23							my $searchio = Bio::SearchIO->new(-format => 'gmap',
24							-file => 't/data/her2.gmapz');
25							while( my $result = $searchio->next_result ) {
26							while( my $hit = $result->next_hit ) {
27							while( my $hsp = $hit->next_hsp ) {
28							# ...
29							}
30							}
31							}
32
33
34							=head1 DESCRIPTION
35
36							This object encapsulates the necessary methods for generating events
37							suitable for building Bio::Search objects from a GMAP "compressed"
38							report (from gmap run with the -Z flag). Read the L<Bio::SearchIO>
39							documentation for more information about how to use this.
40
41							=head2 REVERSE STRAND AND BIOPERL COORDINATES
42
43							I believe that I'm doing the correct thing when reporting hits on the
44							negative strand of the genome. In particular, I've compared the
45							"exons" this code generates with the set returned by ncbi's megablast
46							web service. NCBI's HSPs are ordered differently and have a
47							different genomic location (off by ~18,000,000 bases, padding?) but
48							the starts, ends, and lengths were similar and my strand handling
49							matches theirs. E.g.
50
51							CDNA GENOME
52							start end strand start end strand
53
54							blast
55							1913 2989 1 86236731 86237808 -1
56							1 475 1 86260509 86260983 -1
57							1510 1727 1 86240259 86240476 -1
58							841 989 1 86243034 86243182 -1
59							1381 1514 1 86240630 86240763 -1
60							989 1122 1 86242457 86242590 -1
61							599 729 1 86247470 86247600 -1
62							473 608 1 86259972 86260107 -1
63							1255 1382 1 86240837 86240964 -1
64							730 842 1 86244040 86244152 -1
65							1813 1921 1 86238123 86238231 -1
66							1725 1814 1 86239747 86239836 -1
67							1167 1256 1 86241294 86241383 -1
68							1120 1188 1 86242319 86242387 -1
69
70							gmap
71							1 475 1 104330509 104330983 -1
72							476 600 1 104329980 104330104 -1
73							601 729 1 104317470 104317598 -1
74							730 841 1 104314041 104314152 -1
75							842 989 1 104313034 104313181 -1
76							990 1121 1 104312458 104312589 -1
77							1122 1187 1 104312320 104312385 -1
78							1188 1256 1 104311294 104311362 -1
79							1257 1382 1 104310837 104310962 -1
80							1383 1511 1 104310633 104310761 -1
81							1512 1726 1 104310260 104310474 -1
82							1727 1814 1 104309747 104309834 -1
83							1815 1917 1 104308127 104308229 -1
84							1918 2989 1 104306731 104307802 -1
85
86							=head1 FEEDBACK
87
88							=head2 Mailing Lists
89
90							User feedback is an integral part of the evolution of this and other
91							Bioperl modules. Send your comments and suggestions preferably to
92							the Bioperl mailing list. Your participation is much appreciated.
93
94							bioperl-l@bioperl.org - General discussion
95							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
96
97							=head2 Reporting Bugs
98
99							Report bugs to the Bioperl bug tracking system to help us keep track
100							of the bugs and their resolution. Bug reports can be submitted via
101							the web:
102
103							https://github.com/bioperl/bioperl-live/issues
104
105							=head1 AUTHOR - George Hartzell
106
107							Email hartzell@alerce.com
108
109							=head1 CONTRIBUTORS
110
111							Additional contributors names and emails here
112
113							=head1 APPENDIX
114
115							The rest of the documentation details each of the object methods.
116							Internal methods are usually preceded with an underscore (_).
117
118							=cut
119
120
121							# Let the code begin...
122
123
124							package Bio::SearchIO::gmap_f9;
125	1			1		4	use strict;
	1					0
	1					24
126	1			1		2	use warnings;
	1					1
	1					20
127
128	1			1		418	use Bio::Search::Hit::GenericHit;
	1					2
	1					25
129	1			1		442	use Bio::Search::HSP::GenericHSP;
	1					3
	1					39
130
131	1			1		5	use base qw(Bio::SearchIO );
	1					1
	1					66
132
133	1			1		5	use Data::Dumper;
	1					0
	1					954
134
135							=head2 next_result
136
137							Title : next_result
138							Usage : $result = stream->next_result
139							Function: Reads the next ResultI object from the stream and returns it.
140							Returns : A Bio::Search::Result::ResultI object
141							Args : n/a
142
143							=cut
144
145							sub next_result {
146	62			62	1	124	my $self = shift;
147
148	62					78	my $info = [];
149	62					63	my $result;
150							my $hit;
151	0					0	my @hsp_info;
152	0					0	my $previous_hit_pos;
153
154	62					163	while ( $_ = $self->_readline ) {
155	86654	100				97153	if ( $_ =~ /^>/ ) { # looking at the start of a result
156	118	100				212	if ($result) { # and done if there's one in progress
157	57					186	$self->_pushback($_);
158	57					826	goto DONE;
159							}
160							else { # otherwise start a new one.
161	61					416	my ($id, $desc, $md5) = m\|>([^ ])\s(.)\s(?:md5:(.*))?\|;
162
163	61					280	$result = Bio::Search::Result::GenericResult->new();
164	61					187	$result->algorithm('gmap');
165	61					174	$result->query_name($id);
166	61					147	$result->query_accession($id);
167	61					164	$result->query_description($desc);
168							#$self->warn("Take care of MD5!\n");
169
170	61		33			319	$hit \|\|= Bio::Search::Hit::GenericHit->new( -name =>
171							"NONE_SPECIFIED");
172							}
173							}
174							else { # add another position to the hit/hsp
175							# 468 H 1956 C -14:104307764 2298317517 C H
176							# 468 1957 A -14:104307763 2298317516 A
177	86536					51492	my $c; # info about a column
178							($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos},
179							$c->{query_base},
180							$c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos},
181							$c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa})
182	86536					576883	= ($_ =~
183							m\|
184							(\d+)[ ]?(.)?[\t]
185							(\d+)[ ]?(.)?[\t]
186							# TODO chromosome isn't a number... X, Y, MT....
187							(\+\\|\-)([\dxXyY]+\\|MT):(\d+)[ ](\d+)[ ](.)
188							[\t]?(.)?
189							\|xo
190							);
191
192	86536	100	100			325045	if ($previous_hit_pos &&
193							(abs($c->{hit_pos} - $previous_hit_pos) > 1)) {
194	541		33			928	$hit \|\|= Bio::Search::Hit::GenericHit->new( -name =>
195							"NONE_SPECIFIED",
196							);
197	541					1379	$hit->add_hsp( $self->_hsp_from_info(\@hsp_info) );
198	541					58869	@hsp_info = ();
199							}
200	86536					65951	push @hsp_info, $c;
201	86536					162936	$previous_hit_pos = $c->{hit_pos};
202							}
203							}
204
205							DONE:
206	62	100				166	if ($result) {
207	61	50				308	$hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info);
208
209	61					84	my ($hit_length,$query_length);
210	61					289	for my $hsp ($hit->hsps) {
211	602					1083	$hit_length += $hsp->length();
212	602					911	$query_length += $hsp->length('query');
213							}
214	61					314	$hit->length($hit_length);
215	61					216	$hit->query_length($query_length);
216							# update this now that we actually know something useful.
217	61					228	$hit->name($hsp_info[0]->{hit_chromo});
218
219	61	50				344	$result->add_hit($hit) if ($hit);
220							}
221
222	62					13614	return($result);
223							}
224
225
226
227							sub _hsp_from_info {
228	602			602		579	my $self = shift;
229	602					399	my $info = shift;
230	602					677	my $a = {}; # args w/ which we'll create hsp
231	602					502	my $hsp;
232							my $identical;
233
234	602					1135	$a->{-algorithm} = 'GMAP';
235
236	602					596	for my $c (@{$info}) {
	602					1190
237	86536					79680	$a->{-query_seq} .= $c->{query_base};
238	86536					70419	$a->{-hit_seq} .= $c->{hit_base};
239	86536	100				105394	$a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
240	86536	100				110984	$identical++ if ( $c->{query_base} eq $c->{hit_base} );
241							}
242
243	602					1244	$a->{-query_seq} =~ s\| \|\-\|g; # switch to bioperl gaps.
244	602					961	$a->{-hit_seq} =~ s\| \|\-\|g;
245
246	602					1515	$a->{-conserved} = $a->{-identical} = $identical;
247
248							# use the coordinates from gmap's -f 9 output to
249							# determine whether gmap revcomped the query sequence
250							# to generate the alignment. Note that this is not
251							# the same as the cDNA's sense/anti-sense-ness.
252	602					1014	$a->{-stranded} = 'both';
253
254	602					1458	$a->{-query_start} = $info->[0]->{query_pos};
255	602					1129	$a->{-query_end} = $info->[-1]->{query_pos};
256	602					1711	$a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
257
258							# hit can be on either strand; the -f 9 output tells us which.
259							# we don't have to worry about it here: because we tell the GenericHSP
260							# code that this hit is 'stranded', it compares the start and end
261							# positions and sets the strand for us.
262	602					940	$a->{-hit_start} = $info->[0]->{hit_pos};
263	602					872	$a->{-hit_end} = $info->[-1]->{hit_pos};
264
265	602					1311	$a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
266
267							$a->{-hsp_length} =
268							$a->{-query_length} > $a->{-hit_length} ?
269	602	50				1753	$a->{-query_length} : $a->{-hit_length};
270
271	602					3667	$hsp = Bio::Search::HSP::GenericHSP->new( %$a );
272
273	602					3465	return $hsp;
274							}
275
276							# TODO: adjust regexp to swallow lines w/out md5 sigs.
277							sub _parse_path_header {
278	0			0			my $self = shift;
279	0						my $path_line = shift;
280	0						my $path = {};
281
282							(
283							$path->{query},
284							$path->{db},
285							$path->{path_num},
286							$path->{path_total_num},
287							$path->{query_length},
288							$path->{exon_count},
289							$path->{trimmed_coverage},
290							$path->{percent_identity},
291							$path->{query_start},
292							$path->{query_end},
293							$path->{whole_genome_start},
294							$path->{whole_genome_end},
295							$path->{chromosome},
296							$path->{chromo_start},
297							$path->{chromo_end},
298							$path->{strand},
299							$path->{sense},
300							$path->{md5},
301	0						) =
302							($_ =~ qr\|
303							>
304							([^ ]*)[ ] # the query id, followed by a space
305							([^ ]*)[ ] # the genome database, followed by a space
306							(\d+)/(\d+)[ ] # path_num/path_total_num (e.g. 3/12)
307							(\d+)[ ] # query length, followed by a space
308							(\d+)[ ] # hsp/exon count, followed by a space
309							(\d+\.\d*)[ ] # trimmed coverage
310							(\d+\.\d*)[ ] # percent identity
311							(\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
312							(\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
313							(\d+): # chromosome number
314							(\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
315							([+-])[ ] # strand, followed by a space
316							dir:(.*) # dir:sense or dir:antisense
317							[ ]md5:([\dabcdefg]+) # md5 signature
318							\|x
319							);
320
321	0	0					$path->{query} or $self->throw("query was not found in path line.");
322	0	0					$path->{db} or $self->throw("db was not found in path line.");
323	0	0					$path->{path_num} or $self->throw("path_num was not found in path line.");
324							$path->{path_total_num} or
325	0	0					$self->throw("path_total_num was not found in path line.");
326							$path->{query_length} or
327	0	0					$self->throw("query_length was not found in path line.");
328							$path->{exon_count} or
329	0	0					$self->throw("exon_count was not found in path line.");
330							$path->{trimmed_coverage} or
331	0	0					$self->throw("trimmed_coverage was not found in path line.");
332							$path->{percent_identity} or
333	0	0					$self->throw("percent_identity was not found in path line.");
334							$path->{query_start} or
335	0	0					$self->throw("query_start was not found in path line.");
336							$path->{query_end} or
337	0	0					$self->throw("query_end was not found in path line.");
338							$path->{whole_genome_start} or
339	0	0					$self->throw("whole_genome_start was not found in path line.");
340							$path->{whole_genome_end} or
341	0	0					$self->throw("whole_genome_end was not found in path line.");
342							$path->{chromosome} or
343	0	0					$self->throw("chromosome was not found in path line.");
344							$path->{chromo_start} or
345	0	0					$self->throw("chromo_start was not found in path line.");
346							$path->{chromo_end} or
347	0	0					$self->throw("chromo_end was not found in path line.");
348	0	0					$path->{strand} or $self->throw("strand was not found in path line.");
349	0	0					$path->{sense} or $self->throw("sense was not found in path line.");
350
351	0						return $path;
352							}
353
354							sub _parse_alignment_line {
355	0			0			my $self = shift;
356	0						my $a_line = shift;
357	0						my $align = {};
358
359							(
360							$align->{chromo_start},
361							$align->{chromo_end},
362							$align->{query_start},
363							$align->{query_end},
364							$align->{percent_identity},
365							$align->{align_length},
366							$align->{intron_length},
367	0						) =
368							($_ =~ qr\|
369							[\t]
370							([\d]+)[ ] # start in chromosome coord.
371							([\d]+)[ ] # end in chromosome coord.
372							([\d]+)[ ] # start in query coord.
373							([\d]+)[ ] # end in query coord.
374							([\d]+) # percent identity (as integer)
375							[\t].*[\t] # skip the edit script
376							([\d]+) # length of alignment block.
377							[\t]([\d]+) # length of following intron.
378							\|x
379							);
380
381							$align->{chromo_start}
382	0	0					or $self->throw("chromo_start missing in alignment line.");
383							$align->{chromo_end},
384	0	0					or $self->throw("chromo_end was missing in alignment line.");
385							$align->{query_start},
386	0	0					or $self->throw("query_start was missing in alignment line.");
387							$align->{query_end},
388	0	0					or $self->throw("query_end was missing in alignment line.");
389							$align->{percent_identity},
390	0	0					or $self->throw("percent_identity was missing in alignment line.");
391							$align->{align_length},
392	0	0					or $self->throw("align_length was missing in alignment line.");
393
394	0						return $align;
395							}
396
397							1;