File Coverage

Bio/Tools/Est2Genome.pm

Criterion	Covered	Total	%
statement	90	99	90.9
branch	39	54	72.2
condition	2	3	66.6
subroutine	12	13	92.3
pod	3	3	100.0
total	146	172	84.8

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::Tools::Est2Genome
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Jason Stajich
7							#
8							# Copyright Jason Stajich
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::Tools::Est2Genome - Parse est2genome output, makes simple Bio::SeqFeature::Generic objects
17
18							=head1 SYNOPSIS
19
20							use Bio::Tools::Est2Genome;
21
22							my $featureiter = Bio::Tools::Est2Genome->new(-file => 'output.est2genome');
23
24							# This is going to be fixed to use the SeqAnalysisI next_feature
25							# Method eventually when we have the objects to put the data in
26							# properly
27							while( my $f = $featureiter->parse_next_gene ) {
28							# process Bio::SeqFeature::Generic objects here
29							}
30
31							=head1 DESCRIPTION
32
33							This module is a parser for C [EMBOSS] alignments of est/cdna
34							sequence to genomic DNA. This is generally accepted as the best
35							program for predicting splice sites based on est/dnas (as far as I know).
36
37							This module currently does not try pull out the ungapped alignments
38							(Segment) but may in the future.
39
40							=head1 FEEDBACK
41
42							=head2 Mailing Lists
43
44							User feedback is an integral part of the evolution of this and other
45							Bioperl modules. Send your comments and suggestions preferably to
46							the Bioperl mailing list. Your participation is much appreciated.
47
48							bioperl-l@bioperl.org - General discussion
49							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50
51							=head2 Support
52
53							Please direct usage questions or support issues to the mailing list:
54
55							I
56
57							rather than to the module maintainer directly. Many experienced and
58							reponsive experts will be able look at the problem and quickly
59							address it. Please include a thorough description of the problem
60							with code and data examples if at all possible.
61
62							=head2 Reporting Bugs
63
64							Report bugs to the Bioperl bug tracking system to help us keep track
65							of the bugs and their resolution. Bug reports can be submitted the
66							web:
67
68							https://github.com/bioperl/bioperl-live/issues
69
70							=head1 AUTHOR - Jason Stajich
71
72							Email jason-at-bioperl.org
73
74							=head1 APPENDIX
75
76							The rest of the documentation details each of the object methods.
77							Internal methods are usually preceded with a _
78
79							=cut
80
81
82							# Let the code begin...
83
84
85							package Bio::Tools::Est2Genome;
86	1			1		442	use strict;
	1					1
	1					23
87
88							# Object preamble - inherits from Bio::Root::Root
89
90	1			1		263	use Bio::Root::Root;
	1					1
	1					26
91	1			1		259	use Bio::SeqFeature::Gene::Exon;
	1					863
	1					39
92	1			1		299	use Bio::SeqFeature::Gene::Transcript;
	1					3
	1					28
93	1			1		273	use Bio::SeqFeature::Gene::Intron;
	1					1
	1					27
94	1			1		280	use Bio::SeqFeature::Gene::GeneStructure;
	1					2
	1					25
95	1			1		268	use Bio::SeqFeature::SimilarityPair;
	1					1
	1					79
96
97	1			1		6	use base qw(Bio::Tools::AnalysisResult);
	1					1
	1					288
98
99							=head2 new
100
101							Title : new
102							Usage : my $obj = Bio::Tools::Est2Genome->new();
103							Function: Builds a new Bio::Tools::Est2Genome object
104							Returns : an instance of Bio::Tools::Est2Genome
105							Args : -file => 'output.est2genome' or
106							-fh => \*EST2GENOMEOUTPUT
107							-genomefirst => 1 # genome was the first input (not standard)
108
109							=cut
110
111							sub _initialize_state {
112	2			2		3	my($self,@args) = @_;
113
114							# call the inherited method first
115	2					8	my $make = $self->SUPER::_initialize_state(@args);
116
117	2					7	my ($genome_is_first) = $self->_rearrange([qw(GENOMEFIRST)], @args);
118
119	2					5	delete($self->{'_genome_is_first'});
120	2	50				5	$self->{'_genome_is_first'} = $genome_is_first if(defined($genome_is_first));
121	2					112	$self->analysis_method("est2genome");
122							}
123
124							=head2 analysis_method
125
126							Usage : $sim4->analysis_method();
127							Purpose : Inherited method. Overridden to ensure that the name matches
128							/est2genome/i.
129							Returns : String
130							Argument : n/a
131
132							=cut
133
134							#-------------
135							sub analysis_method {
136							#-------------
137	38			38	1	34	my ($self, $method) = @_;
138	38	50	66			70	if($method && ($method !~ /est2genome/i)) {
139	0					0	$self->throw("method $method not supported in " . ref($self));
140							}
141	38					67	return $self->SUPER::analysis_method($method);
142							}
143
144							=head2 parse_next_gene
145
146							Title : parse_next_gene
147							Usage : @gene = $est2genome_result->parse_next_gene;
148							foreach $exon (@exons) {
149							# do something
150							}
151
152							Function: Parses the next alignments of the est2genome result file and
153							returns the found exons as an array of
154							Bio::SeqFeature::SimilarityPair objects. Call
155							this method repeatedly until an empty array is returned to get the
156							results for all alignments.
157
158							The $exon->seq_id() attribute will be set to the identifier of the
159							respective sequence for both sequences.
160							The length is accessible via the seqlength()
161							attribute of $exon->query() and
162							$exon->est_hit().
163							Returns : An array (or array reference) of Bio::SeqFeature::SimilarityPair and Bio::SeqFeature::Generic objects
164							or Bio::SeqFeature::Gene::GeneStructure
165							Args : flag(1/0) indicating to return Bio::SeqFeature::Gene::GeneStructure or Bio::SeqFeature::SimilarityPair
166							defaults to 0
167
168							=cut
169
170							sub parse_next_gene {
171	2			2	1	368	my ($self,$return_gene) = @_;
172	2	100				8	return $self->_parse_gene_struct if $return_gene;
173	1					1	my $seensegment = 0;
174	1					2	my @features;
175	1					2	my ($qstrand,$hstrand) = (1,1);
176	1					1	my $lasthseqname;
177	1					6	while( defined($_ = $self->_readline) ) {
178	16	100				76	if( /Note Best alignment is between (reversed\|forward) est and (reversed\|forward) genome, (but\|and) splice\s+sites imply\s+(forward gene\|REVERSED GENE)/) {
		100
		100
		100
		100
		50
179	1	50				3	if( $seensegment ) {
180	0					0	$self->_pushback($_);
181	0	0				0	return wantarray ? @features : \@features;
182							}
183	1	50				4	$hstrand = -1 if $1 eq 'reversed';
184	1	50				4	$qstrand = -1 if $4 eq 'REVERSED GENE';
185							#$self->debug( "1=$1, 2=$2, 4=$4\n");
186							}
187							elsif( /^Exon/ ) {
188	4					14	my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,
189							$hstart,$hend, $hseqname) = split;
190	4					5	$lasthseqname = $hseqname;
191	4					10	my $query = Bio::SeqFeature::Similarity->new(-primary => $name,
192							-source => $self->analysis_method,
193							-seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
194							-start => $qstart,
195							-end => $qend,
196							-strand => $qstrand,
197							-score => $score,
198							-tag => {
199							# 'Location' => "$hstart..$hend",
200							'Sequence' => "$hseqname",
201							'identity' => $perc_ident,
202							}
203							);
204	4					15	my $hit = Bio::SeqFeature::Similarity->new(-primary => 'exon_hit',
205							-source => $self->analysis_method,
206							-seq_id => $hseqname,
207							-start => $hstart,
208							-end => $hend,
209							-strand => $hstrand,
210							-score => $score,
211							-tag => {
212							# 'Location' => "$qstart..$qend",
213							'Sequence' => "$qseqname",
214							'identity' => $perc_ident,
215							}
216							);
217	4					14	push @features, Bio::SeqFeature::SimilarityPair->new
218							(-query => $query,
219							-hit => $hit,
220							-source => $self->analysis_method);
221							} elsif( /^([\-\+\?])(Intron)/) {
222	3					9	my ($name,$score,$perc_ident,$qstart,$qend,$qseqname) = split;
223	3					9	push @features, Bio::SeqFeature::Generic->new(-primary => $2,
224							-source => $self->analysis_method,
225							-start => $qstart,
226							-end => $qend,
227							-strand => $qstrand,
228							-score => $score,
229							-seq_id => $qseqname,
230							-tag => {
231							'identity' => $perc_ident,
232							'Sequence' => $lasthseqname});
233							} elsif( /^Span/ ) {
234							} elsif( /^Segment/ ) {
235	5					9	$seensegment = 1;
236							} elsif( /^\s+$/ ) { # do nothing
237							} else {
238	0					0	$self->warn( "unknown line $_\n");
239							}
240							}
241	1	50				4	return unless( @features );
242	1	50				4	return wantarray ? @features : \@features;
243							}
244
245							sub _parse_gene_struct {
246	1			1		1	my ($self) = @_;
247	1					2	my $seensegment = 0;
248	1					0	my @features;
249	1					2	my ($qstrand,$hstrand) = (1,1);
250	1					1	my $lasthseqname;
251	1					3	my $gene = Bio::SeqFeature::Gene::GeneStructure->new(-source => $self->analysis_method);
252	1					4	my $transcript = Bio::SeqFeature::Gene::Transcript->new(-source => $self->analysis_method);
253	1					2	my @suppf;
254							my @exon;
255	1					2	while( defined($_ = $self->_readline) ) {
256	16	100				71	if( /Note Best alignment is between (reversed\|forward) est and (reversed\|forward) genome, (but\|and) splice\s+sites imply\s+(forward gene\|REVERSED GENE)/) {
		100
		100
		100
		100
		50
257	1	50				3	if( $seensegment ) {
258	0					0	$self->_pushback($_);
259	0					0	return $gene;
260							}
261	1	50				4	$hstrand = -1 if $1 eq 'reversed';
262	1	50				4	$qstrand = -1 if $4 eq 'REVERSED GENE';
263							}
264							elsif( /^Exon/ ) {
265	4					13	my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,$hstart,$hend, $hseqname) = split;
266	4					6	$lasthseqname = $hseqname;
267	4					8	my $exon = Bio::SeqFeature::Gene::Exon->new(-primary => $name,
268							-source => $self->analysis_method,
269							-seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
270							-start => $qstart,
271							-end => $qend,
272							-strand => $qstrand,
273							-score => $score,
274							-tag => {
275							#'Location' => "$hstart..$hend",
276							'identity' => $perc_ident,
277							'Sequence' => "$hseqname",
278							}
279							);
280	4	100				14	$transcript->seq_id($qseqname) unless $transcript->seq_id;
281	4					7	$exon->add_tag_value('phase',0);
282	4					12	push @exon, $exon;
283
284							} elsif( /^([\-\+\?])(Intron)/) {
285	3					6	next; #intron auto matically built from exons..hope thats ok..
286							} elsif( /^Span/ ) {
287							} elsif( /^Segment/ ) {
288	5					18	my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,$hstart,$hend, $hseqname) = split;
289	5					14	my $query = Bio::SeqFeature::Similarity->new(-primary => $name,
290							-source => $self->analysis_method,
291							-seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
292							-start => $qstart,
293							-end => $qend,
294							-strand => $qstrand,
295							-score => $score,
296							-tag => {
297							# 'Location' => "$hstart..$hend",
298							'Sequence' => "$hseqname",
299							'identity' => $perc_ident,
300							}
301							);
302	5					17	my $hit = Bio::SeqFeature::Similarity->new(-primary => 'exon_hit',
303							-source => $self->analysis_method,
304							-seq_id => $hseqname,
305							-start => $hstart,
306							-end => $hend,
307							-strand => $hstrand,
308							-score => $score,
309							-tag => {
310							# 'Location' => "$qstart..$qend",
311							'Sequence' => "$qseqname",
312							'identity' => $perc_ident,
313							}
314							);
315	5					17	my $support = Bio::SeqFeature::SimilarityPair->new(-query => $query,
316							-hit => $hit,
317							-source => $self->analysis_method);
318	5					19	push @suppf, $support;
319							} elsif( /^\s+$/ ) { # do nothing
320							} else {
321	0					0	$self->warn( "unknown line $_\n");
322							}
323							}
324	1	50				4	return unless $#exon >=0;
325	1					2	foreach my $e(@exon){
326	4					3	my @add;
327	4					5	foreach my $sf(@suppf){
328	20	100				33	if($sf->overlaps($e)){
329	5					10	push @add,$sf;
330							}
331							}
332	4					10	$e->add_tag_value('supporting_feature',@add);
333	4					9	$transcript->add_exon($e);
334							}
335
336	1					4	$gene->add_transcript($transcript);
337	1					2	$gene->seq_id($transcript->seq_id);
338	1					4	return $gene;
339							}
340
341							=head2 next_feature
342
343							Title : next_feature
344							Usage : $seqfeature = $obj->next_feature();
345							Function: Returns the next feature available in the analysis result, or
346							undef if there are no more features.
347							Example :
348							Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
349							more features.
350							Args : none
351
352							=cut
353
354							sub next_feature {
355	0			0	1		my ($self) = shift;
356	0						$self->throw("We haven't really done this right, yet, use parse_next_gene");
357							}
358
359
360							1;