File Coverage

Bio/Tools/Sim4/Results.pm
Criterion Covered Total %
statement 99 119 83.1
branch 27 48 56.2
condition 2 6 33.3
subroutine 9 10 90.0
pod 4 4 100.0
total 141 187 75.4


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Tools::Sim4::Results
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Ewan Birney
7             # and Hilmar Lapp
8             #
9             # Copyright Ewan Birney and Hilmar Lapp
10             #
11             # You may distribute this module under the same terms as perl itself
12              
13             # POD documentation - main docs before the code
14              
15             =head1 NAME
16              
17             Bio::Tools::Sim4::Results - Results of one Sim4 run
18              
19             =head1 SYNOPSIS
20              
21             # to preset the order of EST and genomic file as given on the sim4
22             # command line:
23             my $sim4 = Bio::Tools::Sim4::Results->new(-file => 'result.sim4',
24             -estfirst => 1);
25             # to let the order be determined automatically (by length comparison):
26             $sim4 = Bio::Tools::Sim4::Results->new( -file => 'sim4.results' );
27             # filehandle:
28             $sim4 = Bio::Tools::Sim4::Results->new( -fh => \*INPUT );
29              
30             # parse the results
31             while(my $exonset = $sim4->next_exonset()) {
32             # $exonset is-a Bio::SeqFeature::Generic with Bio::Tools::Sim4::Exons
33             # as sub features
34             print "Delimited on sequence ", $exonset->seq_id(),
35             "from ", $exonset->start(), " to ", $exonset->end(), "\n";
36             foreach my $exon ( $exonset->sub_SeqFeature() ) {
37             # $exon is-a Bio::SeqFeature::FeaturePair
38             print "Exon from ", $exon->start, " to ", $exon->end,
39             " on strand ", $exon->strand(), "\n";
40             # you can get out what it matched using the est_hit attribute
41             my $homol = $exon->est_hit();
42             print "Matched to sequence ", $homol->seq_id,
43             " at ", $homol->start," to ", $homol->end, "\n";
44             }
45             }
46              
47             # essential if you gave a filename at initialization (otherwise the file
48             # stays open)
49             $sim4->close();
50              
51             =head1 DESCRIPTION
52              
53             The sim4 module provides a parser and results object for sim4 output. The
54             sim4 results are specialised types of SeqFeatures, meaning you can add them
55             to AnnSeq objects fine, and manipulate them in the "normal" seqfeature manner.
56              
57             The sim4 Exon objects are Bio::SeqFeature::FeaturePair inherited objects. The
58             $esthit = $exon-Eest_hit() is the alignment as a feature on the matching
59             object (normally, an EST), in which the start/end points are where the hit
60             lies.
61              
62             To make this module work sensibly you need to run
63              
64             sim4 genomic.fasta est.database.fasta
65             or
66             sim4 est.fasta genomic.database.fasta
67              
68             To get the sequence identifiers recorded for the first sequence, too, use
69             A=4 as output option for sim4.
70              
71             One fiddle here is that there are only two real possibilities to the matching
72             criteria: either one sequence needs reversing or not. Because of this, it
73             is impossible to tell whether the match is in the forward or reverse strand
74             of the genomic DNA. We solve this here by assuming that the genomic DNA is
75             always forward. As a consequence, the strand attribute of the matching EST is
76             unknown, and the strand attribute of the genomic DNA (i.e., the Exon object)
77             will reflect the direction of the hit.
78              
79             See the documentation of parse_next_alignment() for abilities of the parser
80             to deal with the different output format options of sim4.
81              
82             =head1 FEEDBACK
83              
84             =head2 Mailing Lists
85              
86             User feedback is an integral part of the evolution of this and other
87             Bioperl modules. Send your comments and suggestions preferably to one
88             of the Bioperl mailing lists. Your participation is much appreciated.
89              
90             bioperl-l@bioperl.org - General discussion
91             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
92              
93             =head2 Support
94              
95             Please direct usage questions or support issues to the mailing list:
96              
97             I
98              
99             rather than to the module maintainer directly. Many experienced and
100             reponsive experts will be able look at the problem and quickly
101             address it. Please include a thorough description of the problem
102             with code and data examples if at all possible.
103              
104             =head2 Reporting Bugs
105              
106             Report bugs to the Bioperl bug tracking system to help us keep track
107             the bugs and their resolution. Bug reports can be submitted via the
108             web:
109              
110             https://github.com/bioperl/bioperl-live/issues
111              
112             =head1 AUTHOR - Ewan Birney, Hilmar Lapp
113              
114             Ewan Birney Ebirney-at-sanger.ac.ukE
115             Hilmar Lapp Ehlapp-at-gmx.netE or Ehilmar.lapp-at-pharma.novartis.comE.
116              
117             =head1 APPENDIX
118              
119             The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
120              
121             =cut
122              
123              
124             # Let the code begin...
125              
126              
127             package Bio::Tools::Sim4::Results;
128 1     1   494 use strict;
  1         1  
  1         22  
129              
130              
131 1     1   3 use File::Basename;
  1         1  
  1         55  
132 1     1   288 use Bio::Root::Root;
  1         1  
  1         26  
133 1     1   280 use Bio::Tools::Sim4::Exon;
  1         2  
  1         32  
134              
135 1     1   4 use base qw(Bio::Tools::AnalysisResult);
  1         1  
  1         329  
136              
137              
138             sub _initialize_state {
139 2     2   3 my($self,@args) = @_;
140              
141             # call the inherited method first
142 2         5 my $make = $self->SUPER::_initialize_state(@args);
143              
144 2         6 my ($est_is_first) = $self->_rearrange([qw(ESTFIRST)], @args);
145              
146 2         4 delete($self->{'_est_is_first'});
147 2 50       4 $self->{'_est_is_first'} = $est_is_first if(defined($est_is_first));
148 2         5 $self->analysis_method("Sim4");
149             }
150              
151             =head2 analysis_method
152              
153             Usage : $sim4->analysis_method();
154             Purpose : Inherited method. Overridden to ensure that the name matches
155             /sim4/i.
156             Returns : String
157             Argument : n/a
158              
159             =cut
160              
161             #-------------
162             sub analysis_method {
163             #-------------
164 20     20 1 22 my ($self, $method) = @_;
165 20 50 66     43 if($method && ($method !~ /sim4/i)) {
166 0         0 $self->throw("method $method not supported in " . ref($self));
167             }
168 20         32 return $self->SUPER::analysis_method($method);
169             }
170              
171             =head2 parse_next_alignment
172              
173             Title : parse_next_alignment
174             Usage : @exons = $sim4_result->parse_next_alignment;
175             foreach $exon (@exons) {
176             # do something
177             }
178             Function: Parses the next alignment of the Sim4 result file and returns the
179             found exons as an array of Bio::Tools::Sim4::Exon objects. Call
180             this method repeatedly until an empty array is returned to get the
181             results for all alignments.
182              
183             The $exon->seq_id() attribute will be set to the identifier of the
184             respective sequence for both sequences if A=4 was used in the sim4
185             run, and otherwise for the second sequence only. If the output does
186             not contain the identifier, the filename stripped of path and
187             extension is used instead. In addition, the full filename
188             will be recorded for both features ($exon inherits off
189             Bio::SeqFeature::SimilarityPair) as tag 'filename'. The length
190             is accessible via the seqlength() attribute of $exon->query() and
191             $exon->est_hit().
192              
193             Note that this method is capable of dealing with outputs generated
194             with format 0,1,3, and 4 (via the A=n option to sim4). It
195             automatically determines which of the two sequences has been
196             reversed, and adjusts the coordinates for that sequence. It will
197             also detect whether the EST sequence(s) were given as first or as
198             second file to sim4, unless this has been specified at creation
199             time of the object.
200              
201             Example :
202             Returns : An array of Bio::Tools::Sim4::Exon objects
203             Args :
204              
205              
206             =cut
207              
208             sub parse_next_alignment {
209 3     3 1 4 my ($self) = @_;
210 3         3 my @exons = ();
211 3         3 my %seq1props = ();
212 3         4 my %seq2props = ();
213             # we refer to the properties of each seq by reference
214 3         2 my ($estseq, $genomseq, $to_reverse);
215 3         3 my $started = 0;
216 3         3 my $hit_direction = 1;
217 3         3 my $output_fmt = 3; # same as 0 and 1 (we cannot deal with A=2 produced
218             # output yet)
219            
220 3         12 while(defined($_ = $self->_readline())) {
221             #chomp();
222             #
223             # bascially, each sim4 'hit' starts with seq1...
224             #
225 32 100       53 /^seq1/ && do {
226 4 100       8 if($started) {
227 1         7 $self->_pushback($_);
228 1         2 last;
229             }
230 3         3 $started = 1;
231              
232             # filename and length of seq 1
233 3 50       13 /^seq1\s+=\s+(\S+)\,\s+(\d+)/ ||
234             $self->throw("Sim4 parsing error on seq1 [$_] line. Sorry!");
235 3         8 $seq1props{'filename'} = $1;
236 3         4 $seq1props{'length'} = $2;
237 3         7 next;
238             };
239 28 100       45 /^seq2/ && do {
240             # the second hit has also the database name in the >name syntax
241             # (in brackets).
242 3 50       14 /^seq2\s+=\s+(\S+)\s+\(>?(\S+)\s*\)\,\s+(\d+)/||
243             $self->throw("Sim4 parsing error on seq2 [$_] line. Sorry!");
244 3         6 $seq2props{'filename'} = $1;
245 3         5 $seq2props{'seqname'} = $2;
246 3         5 $seq2props{'length'} = $3;
247 3         6 next;
248             };
249 25 50       31 if(/^>(\S+)\s*(.*)$/) {
250             # output option was A=4, which not only gives the complete
251             # description lines, but also causes the longer sequence to be
252             # reversed if the second file contained one (genomic) sequence
253 0         0 $seq1props{'seqname'} = $1;
254 0 0       0 $seq1props{'description'} = $2 if $2;
255 0         0 $output_fmt = 4;
256             # we handle seq1 and seq2 both here
257 0 0 0     0 if(defined($_ = $self->_readline()) && (/^>(\S+)\s*(.*)$/)) {
258 0         0 $seq2props{'seqname'} = $1; # redundant, since already set above
259 0 0       0 $seq2props{'description'} = $2 if $2;
260             }
261 0         0 next;
262             }
263 25 100       30 /^\(complement\)/ && do {
264 1         1 $hit_direction = -1;
265 1         2 next;
266             };
267             # this matches
268             # start-end (start-end) pctid%
269 24 100       90 if(/(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d+)%/) {
270 18         35 $seq1props{'start'} = $1;
271 18         23 $seq1props{'end'} = $2;
272 18         19 $seq2props{'start'} = $3;
273 18         18 $seq2props{'end'} = $4;
274 18         13 my $pctid = $5;
275            
276 18 100       32 if(! defined($estseq)) {
277             # for the first time here: need to set the references referring
278             # to seq1 and seq2
279 3 100       5 if(! exists($self->{'_est_is_first'})) {
280             # detect which one is the EST by looking at the lengths,
281             # and assume that this holds throughout the entire result
282             # file (i.e., when this method is called for the next
283             # alignment, this will not be checked again)
284 2 50       6 if($seq1props{'length'} > $seq2props{'length'}) {
285 2         4 $self->{'_est_is_first'} = 0;
286             } else {
287 0         0 $self->{'_est_is_first'} = 1;
288             }
289             }
290 3 50       5 if($self->{'_est_is_first'}) {
291 0         0 $estseq = \%seq1props;
292 0         0 $genomseq = \%seq2props;
293             # if the EST is given first, A=4 selects the genomic
294             # seq for being reversed (reversing the EST is default)
295 0 0       0 $to_reverse = ($output_fmt == 4) ? $genomseq : $estseq;
296             } else {
297 3         4 $estseq = \%seq2props;
298 3         3 $genomseq = \%seq1props;
299             # if the EST is the second, A=4 does not change the
300             # seq being reversed (always the EST is reversed)
301 3         3 $to_reverse = $estseq;
302             }
303             }
304 18 100       28 if($hit_direction == -1) {
305             # we have to reverse the coordinates of one of both seqs
306 10         10 my $tmp = $to_reverse->{'start'};
307             $to_reverse->{'start'} =
308 10         19 $to_reverse->{'length'} - $to_reverse->{'end'} + 1;
309 10         12 $to_reverse->{'end'} = $to_reverse->{'length'} - $tmp + 1;
310             }
311             # create and initialize the exon object
312             my $exon = Bio::Tools::Sim4::Exon->new(
313             '-start' => $genomseq->{'start'},
314 18         54 '-end' => $genomseq->{'end'},
315             '-strand' => $hit_direction);
316 18 50       27 if(exists($genomseq->{'seqname'})) {
317 0         0 $exon->seq_id($genomseq->{'seqname'});
318             } else {
319             # take filename stripped of path as fall back
320 18         367 my ($basename) = &File::Basename::fileparse($genomseq->{'filename'}, '\..*');
321 18         42 $exon->seq_id($basename);
322             }
323             $exon->feature1()->add_tag_value('filename',
324 18         31 $genomseq->{'filename'});
325             # feature1 is supposed to be initialized to a Similarity object,
326             # but we provide a safety net
327 18 50       25 if($exon->feature1()->can('seqlength')) {
328 18         23 $exon->feature1()->seqlength($genomseq->{'length'});
329             } else {
330             $exon->feature1()->add_tag_value('SeqLength',
331 0         0 $genomseq->{'length'});
332             }
333             # create and initialize the feature wrapping the 'hit' (the EST)
334             my $fea2 = Bio::SeqFeature::Similarity->new(
335             '-start' => $estseq->{'start'},
336 18         43 '-end' => $estseq->{'end'},
337             '-strand' => 0,
338             '-primary' => "aligning_EST");
339 18 50       25 if(exists($estseq->{'seqname'})) {
340 18         32 $fea2->seq_id($estseq->{'seqname'});
341             } else {
342             # take filename stripped of path as fall back
343             my ($basename) =
344 0         0 &File::Basename::fileparse($estseq->{'filename'}, '\..*');
345 0         0 $fea2->seq_id($basename);
346             }
347 18         28 $fea2->add_tag_value('filename', $estseq->{'filename'});
348 18         32 $fea2->seqlength($estseq->{'length'});
349             # store
350 18         35 $exon->est_hit($fea2);
351             # general properties
352 18         21 $exon->source_tag($self->analysis_method());
353 18         31 $exon->percentage_id($pctid);
354 18         23 $exon->score($exon->percentage_id());
355             # push onto array
356 18         21 push(@exons, $exon);
357 18         43 next; # back to while loop
358             }
359             }
360 3         11 return @exons;
361             }
362              
363             =head2 next_exonset
364              
365             Title : next_exonset
366             Usage : $exonset = $sim4_result->parse_next_exonset;
367             print "Exons start at ", $exonset->start(),
368             "and end at ", $exonset->end(), "\n";
369             foreach $exon ($exonset->sub_SeqFeature()) {
370             # do something
371             }
372             Function: Parses the next alignment of the Sim4 result file and returns the
373             set of exons as a container of features. The container is itself
374             a Bio::SeqFeature::Generic object, with the Bio::Tools::Sim4::Exon
375             objects as sub features. Start, end, and strand of the container
376             will represent the total region covered by the exons of this set.
377              
378             See the documentation of parse_next_alignment() for further
379             reference about parsing and how the information is stored.
380              
381             Example :
382             Returns : An Bio::SeqFeature::Generic object holding Bio::Tools::Sim4::Exon
383             objects as sub features.
384             Args :
385              
386             =cut
387              
388             sub next_exonset {
389 3     3 1 503 my $self = shift;
390 3         4 my $exonset;
391              
392             # get the next array of exons
393 3         6 my @exons = $self->parse_next_alignment();
394 3 50       7 unless( @exons ) {
395 0 0       0 return if eof($self->_fh);
396 0         0 return $self->next_exonset;
397             }
398             # create the container of exons as a feature object itself, with the
399             # data of the first exon for initialization
400 3         7 $exonset = Bio::SeqFeature::Generic->new('-start' => $exons[0]->start(),
401             '-end' => $exons[0]->end(),
402             '-strand' => $exons[0]->strand(),
403             '-primary' => "ExonSet");
404 3         10 $exonset->source_tag($exons[0]->source_tag());
405 3         5 $exonset->seq_id($exons[0]->seq_id());
406             # now add all exons as sub features, with enabling EXPANsion of the region
407             # covered in total
408 3         4 foreach my $exon (@exons) {
409 18         50 $exonset->add_sub_SeqFeature($exon, 'EXPAND');
410             }
411 3         8 return $exonset;
412             }
413              
414             =head2 next_feature
415              
416             Title : next_feature
417             Usage : while($exonset = $sim4->next_feature()) {
418             # do something
419             }
420             Function: Does the same as L. See there for documentation of
421             the functionality. Call this method repeatedly until FALSE is
422             returned.
423              
424             The returned object is actually a SeqFeatureI implementing object.
425             This method is required for classes implementing the
426             SeqAnalysisParserI interface, and is merely an alias for
427             next_exonset() at present.
428              
429             Example :
430             Returns : A Bio::SeqFeature::Generic object.
431             Args :
432              
433             =cut
434              
435             sub next_feature {
436 0     0 1   my ($self,@args) = @_;
437             # even though next_exonset doesn't expect any args (and this method
438             # does neither), we pass on args in order to be prepared if this changes
439             # ever
440 0           return $self->next_exonset(@args);
441             }
442              
443             1;