File Coverage

Bio/Tools/Est2Genome.pm
Criterion Covered Total %
statement 90 99 90.9
branch 39 54 72.2
condition 2 3 66.6
subroutine 12 13 92.3
pod 3 3 100.0
total 146 172 84.8


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Est2Genome
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Tools::Est2Genome - Parse est2genome output, makes simple Bio::SeqFeature::Generic objects
17              
18             =head1 SYNOPSIS
19              
20             use Bio::Tools::Est2Genome;
21              
22             my $featureiter = Bio::Tools::Est2Genome->new(-file => 'output.est2genome');
23              
24             # This is going to be fixed to use the SeqAnalysisI next_feature
25             # Method eventually when we have the objects to put the data in
26             # properly
27             while( my $f = $featureiter->parse_next_gene ) {
28             # process Bio::SeqFeature::Generic objects here
29             }
30              
31             =head1 DESCRIPTION
32              
33             This module is a parser for C [EMBOSS] alignments of est/cdna
34             sequence to genomic DNA. This is generally accepted as the best
35             program for predicting splice sites based on est/dnas (as far as I know).
36              
37             This module currently does not try pull out the ungapped alignments
38             (Segment) but may in the future.
39              
40             =head1 FEEDBACK
41              
42             =head2 Mailing Lists
43              
44             User feedback is an integral part of the evolution of this and other
45             Bioperl modules. Send your comments and suggestions preferably to
46             the Bioperl mailing list. Your participation is much appreciated.
47              
48             bioperl-l@bioperl.org - General discussion
49             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50              
51             =head2 Support
52              
53             Please direct usage questions or support issues to the mailing list:
54              
55             I
56              
57             rather than to the module maintainer directly. Many experienced and
58             reponsive experts will be able look at the problem and quickly
59             address it. Please include a thorough description of the problem
60             with code and data examples if at all possible.
61              
62             =head2 Reporting Bugs
63              
64             Report bugs to the Bioperl bug tracking system to help us keep track
65             of the bugs and their resolution. Bug reports can be submitted the
66             web:
67              
68             https://github.com/bioperl/bioperl-live/issues
69              
70             =head1 AUTHOR - Jason Stajich
71              
72             Email jason-at-bioperl.org
73              
74             =head1 APPENDIX
75              
76             The rest of the documentation details each of the object methods.
77             Internal methods are usually preceded with a _
78              
79             =cut
80              
81              
82             # Let the code begin...
83              
84              
85             package Bio::Tools::Est2Genome;
86 1     1   423 use strict;
  1         1  
  1         23  
87              
88             # Object preamble - inherits from Bio::Root::Root
89              
90 1     1   253 use Bio::Root::Root;
  1         2  
  1         27  
91 1     1   264 use Bio::SeqFeature::Gene::Exon;
  1         2  
  1         33  
92 1     1   313 use Bio::SeqFeature::Gene::Transcript;
  1         2  
  1         27  
93 1     1   272 use Bio::SeqFeature::Gene::Intron;
  1         2  
  1         26  
94 1     1   276 use Bio::SeqFeature::Gene::GeneStructure;
  1         2  
  1         27  
95 1     1   267 use Bio::SeqFeature::SimilarityPair;
  1         1  
  1         26  
96              
97 1     1   5 use base qw(Bio::Tools::AnalysisResult);
  1         1  
  1         280  
98              
99             =head2 new
100              
101             Title : new
102             Usage : my $obj = Bio::Tools::Est2Genome->new();
103             Function: Builds a new Bio::Tools::Est2Genome object
104             Returns : an instance of Bio::Tools::Est2Genome
105             Args : -file => 'output.est2genome' or
106             -fh => \*EST2GENOMEOUTPUT
107             -genomefirst => 1 # genome was the first input (not standard)
108              
109             =cut
110              
111             sub _initialize_state {
112 2     2   6 my($self,@args) = @_;
113              
114             # call the inherited method first
115 2         11 my $make = $self->SUPER::_initialize_state(@args);
116              
117 2         11 my ($genome_is_first) = $self->_rearrange([qw(GENOMEFIRST)], @args);
118              
119 2         5 delete($self->{'_genome_is_first'});
120 2 50       7 $self->{'_genome_is_first'} = $genome_is_first if(defined($genome_is_first));
121 2         186 $self->analysis_method("est2genome");
122             }
123              
124             =head2 analysis_method
125              
126             Usage : $sim4->analysis_method();
127             Purpose : Inherited method. Overridden to ensure that the name matches
128             /est2genome/i.
129             Returns : String
130             Argument : n/a
131              
132             =cut
133              
134             #-------------
135             sub analysis_method {
136             #-------------
137 38     38 1 45 my ($self, $method) = @_;
138 38 50 66     94 if($method && ($method !~ /est2genome/i)) {
139 0         0 $self->throw("method $method not supported in " . ref($self));
140             }
141 38         96 return $self->SUPER::analysis_method($method);
142             }
143              
144             =head2 parse_next_gene
145              
146             Title : parse_next_gene
147             Usage : @gene = $est2genome_result->parse_next_gene;
148             foreach $exon (@exons) {
149             # do something
150             }
151              
152             Function: Parses the next alignments of the est2genome result file and
153             returns the found exons as an array of
154             Bio::SeqFeature::SimilarityPair objects. Call
155             this method repeatedly until an empty array is returned to get the
156             results for all alignments.
157              
158             The $exon->seq_id() attribute will be set to the identifier of the
159             respective sequence for both sequences.
160             The length is accessible via the seqlength()
161             attribute of $exon->query() and
162             $exon->est_hit().
163             Returns : An array (or array reference) of Bio::SeqFeature::SimilarityPair and Bio::SeqFeature::Generic objects
164             or Bio::SeqFeature::Gene::GeneStructure
165             Args : flag(1/0) indicating to return Bio::SeqFeature::Gene::GeneStructure or Bio::SeqFeature::SimilarityPair
166             defaults to 0
167              
168             =cut
169              
170             sub parse_next_gene {
171 2     2 1 950 my ($self,$return_gene) = @_;
172 2 100       13 return $self->_parse_gene_struct if $return_gene;
173 1         2 my $seensegment = 0;
174 1         1 my @features;
175 1         2 my ($qstrand,$hstrand) = (1,1);
176 1         2 my $lasthseqname;
177 1         41 while( defined($_ = $self->_readline) ) {
178 16 100       130 if( /Note Best alignment is between (reversed|forward) est and (reversed|forward) genome, (but|and) splice\s+sites imply\s+(forward gene|REVERSED GENE)/) {
    100          
    100          
    100          
    100          
    50          
179 1 50       4 if( $seensegment ) {
180 0         0 $self->_pushback($_);
181 0 0       0 return wantarray ? @features : \@features;
182             }
183 1 50       6 $hstrand = -1 if $1 eq 'reversed';
184 1 50       8 $qstrand = -1 if $4 eq 'REVERSED GENE';
185             #$self->debug( "1=$1, 2=$2, 4=$4\n");
186             }
187             elsif( /^Exon/ ) {
188 4         24 my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,
189             $hstart,$hend, $hseqname) = split;
190 4         9 $lasthseqname = $hseqname;
191 4         16 my $query = Bio::SeqFeature::Similarity->new(-primary => $name,
192             -source => $self->analysis_method,
193             -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
194             -start => $qstart,
195             -end => $qend,
196             -strand => $qstrand,
197             -score => $score,
198             -tag => {
199             # 'Location' => "$hstart..$hend",
200             'Sequence' => "$hseqname",
201             'identity' => $perc_ident,
202             }
203             );
204 4         22 my $hit = Bio::SeqFeature::Similarity->new(-primary => 'exon_hit',
205             -source => $self->analysis_method,
206             -seq_id => $hseqname,
207             -start => $hstart,
208             -end => $hend,
209             -strand => $hstrand,
210             -score => $score,
211             -tag => {
212             # 'Location' => "$qstart..$qend",
213             'Sequence' => "$qseqname",
214             'identity' => $perc_ident,
215             }
216             );
217 4         26 push @features, Bio::SeqFeature::SimilarityPair->new
218             (-query => $query,
219             -hit => $hit,
220             -source => $self->analysis_method);
221             } elsif( /^([\-\+\?])(Intron)/) {
222 3         16 my ($name,$score,$perc_ident,$qstart,$qend,$qseqname) = split;
223 3         15 push @features, Bio::SeqFeature::Generic->new(-primary => $2,
224             -source => $self->analysis_method,
225             -start => $qstart,
226             -end => $qend,
227             -strand => $qstrand,
228             -score => $score,
229             -seq_id => $qseqname,
230             -tag => {
231             'identity' => $perc_ident,
232             'Sequence' => $lasthseqname});
233             } elsif( /^Span/ ) {
234             } elsif( /^Segment/ ) {
235 5         14 $seensegment = 1;
236             } elsif( /^\s+$/ ) { # do nothing
237             } else {
238 0         0 $self->warn( "unknown line $_\n");
239             }
240             }
241 1 50       6 return unless( @features );
242 1 50       8 return wantarray ? @features : \@features;
243             }
244              
245             sub _parse_gene_struct {
246 1     1   1 my ($self) = @_;
247 1         2 my $seensegment = 0;
248 1         2 my @features;
249 1         2 my ($qstrand,$hstrand) = (1,1);
250 1         1 my $lasthseqname;
251 1         3 my $gene = Bio::SeqFeature::Gene::GeneStructure->new(-source => $self->analysis_method);
252 1         5 my $transcript = Bio::SeqFeature::Gene::Transcript->new(-source => $self->analysis_method);
253 1         2 my @suppf;
254             my @exon;
255 1         5 while( defined($_ = $self->_readline) ) {
256 16 100       82 if( /Note Best alignment is between (reversed|forward) est and (reversed|forward) genome, (but|and) splice\s+sites imply\s+(forward gene|REVERSED GENE)/) {
    100          
    100          
    100          
    100          
    50          
257 1 50       2 if( $seensegment ) {
258 0         0 $self->_pushback($_);
259 0         0 return $gene;
260             }
261 1 50       4 $hstrand = -1 if $1 eq 'reversed';
262 1 50       5 $qstrand = -1 if $4 eq 'REVERSED GENE';
263             }
264             elsif( /^Exon/ ) {
265 4         13 my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,$hstart,$hend, $hseqname) = split;
266 4         7 $lasthseqname = $hseqname;
267 4         11 my $exon = Bio::SeqFeature::Gene::Exon->new(-primary => $name,
268             -source => $self->analysis_method,
269             -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
270             -start => $qstart,
271             -end => $qend,
272             -strand => $qstrand,
273             -score => $score,
274             -tag => {
275             #'Location' => "$hstart..$hend",
276             'identity' => $perc_ident,
277             'Sequence' => "$hseqname",
278             }
279             );
280 4 100       13 $transcript->seq_id($qseqname) unless $transcript->seq_id;
281 4         7 $exon->add_tag_value('phase',0);
282 4         10 push @exon, $exon;
283            
284             } elsif( /^([\-\+\?])(Intron)/) {
285 3         8 next; #intron auto matically built from exons..hope thats ok..
286             } elsif( /^Span/ ) {
287             } elsif( /^Segment/ ) {
288 5         17 my ($name,$score,$perc_ident,$qstart,$qend,$qseqname,$hstart,$hend, $hseqname) = split;
289 5         13 my $query = Bio::SeqFeature::Similarity->new(-primary => $name,
290             -source => $self->analysis_method,
291             -seq_id => $qseqname, # FIXME WHEN WE REDO THE GENERIC NAME CHANGE
292             -start => $qstart,
293             -end => $qend,
294             -strand => $qstrand,
295             -score => $score,
296             -tag => {
297             # 'Location' => "$hstart..$hend",
298             'Sequence' => "$hseqname",
299             'identity' => $perc_ident,
300             }
301             );
302 5         17 my $hit = Bio::SeqFeature::Similarity->new(-primary => 'exon_hit',
303             -source => $self->analysis_method,
304             -seq_id => $hseqname,
305             -start => $hstart,
306             -end => $hend,
307             -strand => $hstrand,
308             -score => $score,
309             -tag => {
310             # 'Location' => "$qstart..$qend",
311             'Sequence' => "$qseqname",
312             'identity' => $perc_ident,
313             }
314             );
315 5         19 my $support = Bio::SeqFeature::SimilarityPair->new(-query => $query,
316             -hit => $hit,
317             -source => $self->analysis_method);
318 5         17 push @suppf, $support;
319             } elsif( /^\s+$/ ) { # do nothing
320             } else {
321 0         0 $self->warn( "unknown line $_\n");
322             }
323             }
324 1 50       4 return unless $#exon >=0;
325 1         3 foreach my $e(@exon){
326 4         5 my @add;
327 4         5 foreach my $sf(@suppf){
328 20 100       40 if($sf->overlaps($e)){
329 5         10 push @add,$sf;
330             }
331             }
332 4         7 $e->add_tag_value('supporting_feature',@add);
333 4         10 $transcript->add_exon($e);
334             }
335            
336 1         7 $gene->add_transcript($transcript);
337 1         3 $gene->seq_id($transcript->seq_id);
338 1         4 return $gene;
339             }
340              
341             =head2 next_feature
342              
343             Title : next_feature
344             Usage : $seqfeature = $obj->next_feature();
345             Function: Returns the next feature available in the analysis result, or
346             undef if there are no more features.
347             Example :
348             Returns : A Bio::SeqFeatureI implementing object, or undef if there are no
349             more features.
350             Args : none
351              
352             =cut
353              
354             sub next_feature {
355 0     0 1   my ($self) = shift;
356 0           $self->throw("We haven't really done this right, yet, use parse_next_gene");
357             }
358              
359              
360             1;