File Coverage

Bio/SearchIO/blasttable.pm
Criterion Covered Total %
statement 150 174 86.2
branch 55 76 72.3
condition 22 36 61.1
subroutine 17 21 80.9
pod 11 12 91.6
total 255 319 79.9


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::SearchIO::blasttable
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SearchIO::blasttable - Driver module for SearchIO for parsing NCBI -m 8/9 format
17              
18             =head1 SYNOPSIS
19              
20             # do not use this module directly
21             use Bio::SearchIO;
22             my $parser = Bio::SearchIO->new(-file => $file,
23             -format => 'blasttable');
24              
25             while( my $result = $parser->next_result ) {
26             }
27              
28             =head1 DESCRIPTION
29              
30             This module will support parsing NCBI -m 8 or -m 9 tabular output
31             and WU-BLAST -mformat 2 or -mformat 3 tabular output.
32              
33             =head1 FEEDBACK
34              
35             =head2 Mailing Lists
36              
37             User feedback is an integral part of the evolution of this and other
38             Bioperl modules. Send your comments and suggestions preferably to
39             the Bioperl mailing list. Your participation is much appreciated.
40              
41             bioperl-l@bioperl.org - General discussion
42             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43              
44             =head2 Support
45              
46             Please direct usage questions or support issues to the mailing list:
47              
48             I
49              
50             rather than to the module maintainer directly. Many experienced and
51             reponsive experts will be able look at the problem and quickly
52             address it. Please include a thorough description of the problem
53             with code and data examples if at all possible.
54              
55             =head2 Reporting Bugs
56              
57             Report bugs to the Bioperl bug tracking system to help us keep track
58             of the bugs and their resolution. Bug reports can be submitted via
59             the web:
60              
61             https://github.com/bioperl/bioperl-live/issues
62              
63             =head1 AUTHOR - Jason Stajich
64              
65             Email jason-at-bioperl-dot-org
66              
67             =head1 APPENDIX
68              
69             The rest of the documentation details each of the object methods.
70             Internal methods are usually preceded with a _
71              
72             =cut
73              
74             # Let the code begin...
75              
76              
77             package Bio::SearchIO::blasttable;
78 2     2   8 use vars qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
  2         3  
  2         120  
79 2     2   8 use strict;
  2         1  
  2         36  
80 2     2   578 use Bio::Search::Result::ResultFactory;
  2         3  
  2         42  
81 2     2   546 use Bio::Search::Hit::HitFactory;
  2         3  
  2         45  
82 2     2   582 use Bio::Search::HSP::HSPFactory;
  2         4  
  2         177  
83              
84             $DefaultProgramName = 'BLASTN';
85             $DEFAULT_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
86              
87             # mapping of terms to Bioperl hash keys
88             %MODEMAP = (
89             'Result' => 'result',
90             'Hit' => 'hit',
91             'Hsp' => 'hsp'
92             );
93              
94             %MAPPING = (
95             'Hsp_bit-score' => 'HSP-bits',
96             'Hsp_score' => 'HSP-score',
97             'Hsp_evalue' => 'HSP-evalue',
98             'Hsp_query-from' => 'HSP-query_start',
99             'Hsp_query-to' => 'HSP-query_end',
100             'Hsp_hit-from' => 'HSP-hit_start',
101             'Hsp_hit-to' => 'HSP-hit_end',
102             'Hsp_positive' => 'HSP-conserved',
103             'Hsp_identity' => 'HSP-identical',
104             'Hsp_mismatches' => 'HSP-mismatches',
105             'Hsp_qgapblocks' => 'HSP-query_gapblocks',
106             'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
107             'Hsp_gaps' => 'HSP-hsp_gaps',
108             'Hsp_hitgaps' => 'HSP-hit_gaps',
109             'Hsp_querygaps' => 'HSP-query_gaps',
110             'Hsp_align-len' => 'HSP-hsp_length',
111             'Hsp_query-frame'=> 'HSP-query_frame',
112             'Hsp_hit-frame' => 'HSP-hit_frame',
113              
114             'Hit_id' => 'HIT-name',
115             'Hit_len' => 'HIT-length',
116             'Hit_accession' => 'HIT-accession',
117             'Hit_def' => 'HIT-description',
118             'Hit_signif' => 'HIT-significance',
119             'Hit_score' => 'HIT-score',
120             'Hit_bits' => 'HIT-bits',
121              
122             'Result_program' => 'RESULT-algorithm_name',
123             'Result_version' => 'RESULT-algorithm_version',
124             'Result_query-def'=> 'RESULT-query_name',
125             'Result_query-len'=> 'RESULT-query_length',
126             'Result_query-acc'=> 'RESULT-query_accession',
127             'Result_querydesc'=> 'RESULT-query_description',
128             'Result_db' => 'RESULT-database_name',
129             'Result_db-len' => 'RESULT-database_entries',
130             'Result_db-let' => 'RESULT-database_letters',
131             );
132              
133 2     2   8 use base qw(Bio::SearchIO);
  2         2  
  2         3034  
134              
135             =head2 new
136              
137             Title : new
138             Usage : my $obj = Bio::SearchIO::blasttable->new();
139             Function: Builds a new Bio::SearchIO::blasttable object
140             Returns : an instance of Bio::SearchIO::blasttable
141             Args :
142              
143              
144             =cut
145              
146             sub _initialize {
147 11     11   24 my ($self,@args) = @_;
148 11         39 $self->SUPER::_initialize(@args);
149              
150 11         44 my ($pname) = $self->_rearrange([qw(PROGRAM_NAME)],
151             @args);
152 11   66     62 $self->program_name($pname || $DefaultProgramName);
153 11         35 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::GenericResult'));
154 11         20 $self->_eventHandler->register_factory('hit', Bio::Search::Hit::HitFactory->new(-type => 'Bio::Search::Hit::GenericHit'));
155 11         23 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::GenericHSP'));
156             }
157              
158              
159             =head2 next_result
160              
161             Title : next_result
162             Usage : my $result = $parser->next_result
163             Function: Parse the next result from the data stream
164             Returns : L
165             Args : none
166              
167              
168             =cut
169              
170             sub next_result{
171 12     12 1 474 my ($self) = @_;
172 12         13 my ($lastquery,$lasthit);
173 12         47 local $/ = "\n";
174 12         12 local $_;
175 12         17 my ($alg, $ver);
176 12         39 while( defined ($_ = $self->_readline) ) {
177             # WU-BLAST -mformat 3 only
178 766 100       1859 if(m{^#\s((?:\S+?)?BLAST[NPX])\s(\d+\.\d+.+\d{4}\])}) {
    100          
179 1         4 ($alg, $ver) = ($1, $2);
180             # only one header for whole file with WU-BLAST
181             # so $alg and $ver won't get set properly for
182             # each result
183 1 50       6 $self->program_name($alg) if $alg;
184 1 50       7 $self->element({'Name' => 'Result_version',
185             'Data' => $ver}) if $ver;
186 1         3 next;
187             }
188             # -m 9 only
189             elsif(m{^#\s+((?:\S+?)?BLAST[NPX])\s+(.+)}) {
190 1         3 ($alg, $ver) = ($1, $2);
191 1         3 next;
192             }
193 764 100 100     2912 next if /^#/ || /^\s*$/;
194              
195 686         2079 my @fields = split;
196 686 50       959 next if @fields == 1;
197 686         497 my ($qname,$hname, $percent_id, $hsp_len, $mismatches,$gapsm,
198             $qstart,$qend,$hstart,$hend,$evalue,$bits);
199             # WU-BLAST-specific
200 0         0 my ($num_scores, $raw_score, $identities, $positives, $percent_pos,
201             $qgap_blocks,$qgaps, $sgap_blocks, $sgaps, $qframe,
202             $sframe);
203             # NCBI -m8 and -m9
204 686 100 33     1112 if (@fields == 12) {
    100          
    50          
205 8         12 ($qname,$hname, $percent_id, $hsp_len, $mismatches,$gapsm,
206             $qstart,$qend,$hstart,$hend,$evalue,$bits) = @fields;
207             # NCBI -m8 and -m9, v 2.2.18+
208             } elsif (@fields == 13) {
209 675         1197 ($qname, $hname, $percent_id, $percent_pos, $hsp_len, $mismatches, $gapsm,
210             $qstart,$qend,$hstart,$hend,$evalue,$bits) = @fields;
211             }
212             # WU-BLAST -mformat 2 and 3
213             elsif ((@fields == 22) or (@fields == 24)) {
214 3         10 ($qname,$hname,$evalue,$num_scores, $bits, $raw_score, $hsp_len,
215             $identities, $positives,$mismatches, $percent_id, $percent_pos,
216             $qgap_blocks, $qgaps, $sgap_blocks, $sgaps, $qframe, $qstart,
217             $qend, $sframe, $hstart,$hend,) = @fields;
218             # we need total gaps in the alignment
219 3         6 $gapsm=$qgaps+$sgaps;
220             }
221              
222 686 100 100     1942 if (@fields == 12 || @fields == 13) {
223             # need to determine total gaps in the alignment for NCBI output
224             # since NCBI reports number of gapopens and NOT total gaps
225 683         1138 my $qlen = abs($qstart - $qend) + 1;
226 683         517 my $querygaps = $hsp_len - $qlen;
227 683         510 my $hlen = abs($hstart - $hend) + 1;
228 683         525 my $hitgaps = $hsp_len - $hlen;
229 683         604 $gapsm = $querygaps + $hitgaps;
230             }
231              
232             # Remember Jim's code is 0 based
233 686 100 100     2718 if( defined $lastquery &&
    100          
    100          
234             $lastquery ne $qname ) {
235 4         13 $self->end_element({'Name' => 'Hit'});
236 4         16 $self->end_element({'Name' => 'Result'});
237 4         20 $self->_pushback($_);
238 4         13 return $self->end_document;
239             } elsif( ! defined $lastquery ) {
240 11         20 $self->{'_result_count'}++;
241 11         40 $self->start_element({'Name' => 'Result'});
242 11   66     50 $self->element({'Name' => 'Result_program',
243             'Data' => $alg || $self->program_name});
244 11 100       39 $self->element({'Name' => 'Result_version',
245             'Data' => $ver}) if $ver;
246 11         37 $self->element({'Name' => 'Result_query-def',
247             'Data' => $qname});
248 11         32 $self->start_element({'Name' => 'Hit'});
249 11         37 $self->element({'Name' => 'Hit_id',
250             'Data' => $hname});
251             # we'll store the 1st hsp bits as the hit bits
252 11         45 $self->element({'Name' => 'Hit_bits',
253             'Data' => $bits});
254             # we'll store the 1st hsp value as the hit evalue
255 11         41 $self->element({'Name' => 'Hit_signif',
256             'Data' => $evalue});
257            
258             } elsif( $lasthit ne $hname ) {
259 640 50       1014 if( $self->in_element('hit') ) {
260 640         1104 $self->end_element({'Name' => 'Hit'});
261             }
262 640         1414 $self->start_element({'Name' => 'Hit'});
263 640         1463 $self->element({'Name' => 'Hit_id',
264             'Data' => $hname});
265             # we'll store the 1st hsp bits as the hit bits
266 640         1475 $self->element({'Name' => 'Hit_bits',
267             'Data' => $bits});
268             # we'll store the 1st hsp value as the hit evalue
269 640         1365 $self->element({'Name' => 'Hit_signif',
270             'Data' => $evalue});
271             }
272 682         1144 my $identical = $hsp_len - $mismatches - $gapsm;
273             # If $positives value is absent, try to recover it from $percent_pos,
274             # this is better than letting the program to assume "conserved == identical"
275 682 100 100     2116 if (not defined $positives and defined $percent_pos) {
276 671         1995 $positives = sprintf "%d", ($percent_pos * $hsp_len / 100);
277             }
278 682         1157 $self->start_element({'Name' => 'Hsp'});
279 682         1381 $self->element({'Name' => 'Hsp_evalue',
280             'Data' => $evalue});
281 682         1560 $self->element({'Name' => 'Hsp_bit-score',
282             'Data' => $bits});
283 682         1460 $self->element({'Name' => 'Hsp_identity',
284             'Data' => $identical});
285 682         1524 $self->element({'Name' => 'Hsp_positive',
286             'Data' => $positives});
287 682         1455 $self->element({'Name' => 'Hsp_gaps',
288             'Data' => $gapsm});
289 682         1594 $self->element({'Name' => 'Hsp_query-from',
290             'Data' => $qstart});
291 682         1476 $self->element({'Name' => 'Hsp_query-to',
292             'Data' => $qend});
293              
294 682         1458 $self->element({'Name' => 'Hsp_hit-from',
295             'Data' => $hstart });
296 682         1553 $self->element({'Name' => 'Hsp_hit-to',
297             'Data' => $hend });
298 682         1462 $self->element({'Name' => 'Hsp_align-len',
299             'Data' => $hsp_len});
300 682         1381 $self->end_element({'Name' => 'Hsp'});
301 682         843 $lastquery = $qname;
302 682         2423 $lasthit = $hname;
303             }
304             # fencepost
305 8 100 66     53 if( defined $lasthit && defined $lastquery ) {
306 7 50       19 if( $self->in_element('hit') ) {
307 7         24 $self->end_element({'Name' => 'Hit'});
308             }
309 7         26 $self->end_element({'Name' => 'Result'});
310 7         25 return $self->end_document;
311             }
312             }
313              
314             =head2 start_element
315              
316             Title : start_element
317             Usage : $eventgenerator->start_element
318             Function: Handles a start element event
319             Returns : none
320             Args : hashref with at least 2 keys 'Data' and 'Name'
321              
322              
323             =cut
324              
325             sub start_element{
326 10142     10142 1 6380 my ($self,$data) = @_;
327             # we currently don't care about attributes
328 10142         7004 my $nm = $data->{'Name'};
329 10142 100       12994 if( my $type = $MODEMAP{$nm} ) {
330 1344         1491 $self->_mode($type);
331 1344 50       1650 if( $self->_will_handle($type) ) {
332 1344         1968 my $func = sprintf("start_%s",lc $type);
333 1344         2020 $self->_eventHandler->$func($data->{'Attributes'});
334             }
335 1344         1347 unshift @{$self->{'_elements'}}, $type;
  1344         1795  
336             }
337 10142 100       13835 if($nm eq 'Result') {
338 11         17 $self->{'_values'} = {};
339 11         22 $self->{'_result'}= undef;
340 11         23 $self->{'_mode'} = '';
341             }
342              
343             }
344              
345             =head2 end_element
346              
347             Title : start_element
348             Usage : $eventgenerator->end_element
349             Function: Handles an end element event
350             Returns : none
351             Args : hashref with at least 2 keys 'Data' and 'Name'
352              
353              
354             =cut
355              
356             sub end_element {
357 10142     10142 1 7187 my ($self,$data) = @_;
358 10142         7100 my $nm = $data->{'Name'};
359 10142         6224 my $rc;
360             # Hsp are sort of weird, in that they end when another
361             # object begins so have to detect this in end_element for now
362            
363 10142 100       16159 if( my $type = $MODEMAP{$nm} ) {
    50          
364 1344 50       1613 if( $self->_will_handle($type) ) {
365 1344         2410 my $func = sprintf("end_%s",lc $type);
366             $rc = $self->_eventHandler->$func($self->{'_reporttype'},
367 1344         2264 $self->{'_values'});
368             }
369 1344         956 shift @{$self->{'_elements'}};
  1344         1328  
370              
371             } elsif( $MAPPING{$nm} ) {
372 8798 50       9238 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
373 0         0 my $key = (keys %{$MAPPING{$nm}})[0];
  0         0  
374 0         0 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
375             } else {
376 8798         10592 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
377             }
378             } else {
379 0         0 $self->warn( "unknown nm $nm ignoring\n");
380             }
381 10142         7945 $self->{'_last_data'} = ''; # remove read data if we are at
382             # end of an element
383 10142 100       12146 $self->{'_result'} = $rc if( $nm eq 'Result' );
384 10142         8876 return $rc;
385              
386             }
387              
388             =head2 element
389              
390             Title : element
391             Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
392             Function: Convience method that calls start_element, characters, end_element
393             Returns : none
394             Args : Hash ref with the keys 'Name' and 'Data'
395              
396              
397             =cut
398              
399             sub element{
400 8798     8798 1 6726 my ($self,$data) = @_;
401 8798         7335 $self->start_element($data);
402 8798         8080 $self->characters($data);
403 8798         8291 $self->end_element($data);
404             }
405              
406              
407             =head2 characters
408              
409             Title : characters
410             Usage : $eventgenerator->characters($str)
411             Function: Send a character events
412             Returns : none
413             Args : string
414              
415              
416             =cut
417              
418             sub characters{
419 8798     8798 1 5401 my ($self,$data) = @_;
420              
421             # deep bug fix: set $self->{'_last_data'} to undef if $$data{Data} is
422             # a valid slot, whose value is undef --
423             # allows an undef to be propagated to object constructors and
424             # handled there as desired; in particular, when Hsp_postive => -conserved
425             # is not defined (in BLASTN, e.g.), the value of hsp's {CONSERVED} property is
426             # set to the value of {IDENTICAL}.
427             #/maj
428             # return unless ( defined $data->{'Data'} );
429 8798 50       22440 return unless ( grep /Data/, keys %$data );
430 8798 100       11535 if ( !defined $data->{'Data'} ) {
431 8         8 $self->{'_last_data'} = undef;
432 8         8 return;
433             }
434 8790 50       13342 if( $data->{'Data'} =~ /^\s+$/ ) {
435 0 0       0 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
436             }
437              
438 8790 50 66     8870 if( $self->in_element('hsp') &&
439             $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
440            
441 0         0 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
442             }
443            
444 8790         9816 $self->{'_last_data'} = $data->{'Data'};
445             }
446              
447             =head2 _mode
448              
449             Title : _mode
450             Usage : $obj->_mode($newval)
451             Function:
452             Example :
453             Returns : value of _mode
454             Args : newvalue (optional)
455              
456              
457             =cut
458              
459             sub _mode{
460 1344     1344   1012 my ($self,$value) = @_;
461 1344 50       1665 if( defined $value) {
462 1344         1150 $self->{'_mode'} = $value;
463             }
464 1344         1098 return $self->{'_mode'};
465             }
466              
467             =head2 within_element
468              
469             Title : within_element
470             Usage : if( $eventgenerator->within_element($element) ) {}
471             Function: Test if we are within a particular element
472             This is different than 'in' because within can be tested
473             for a whole block.
474             Returns : boolean
475             Args : string element name
476              
477              
478             =cut
479              
480             sub within_element{
481 0     0 1 0 my ($self,$name) = @_;
482             return 0 if ( ! defined $name &&
483             ! defined $self->{'_elements'} ||
484 0 0 0     0 scalar @{$self->{'_elements'}} == 0) ;
  0   0     0  
485 0         0 foreach ( @{$self->{'_elements'}} ) {
  0         0  
486 0 0       0 if( $_ eq $name ) {
487 0         0 return 1;
488             }
489             }
490 0         0 return 0;
491             }
492              
493             =head2 in_element
494              
495             Title : in_element
496             Usage : if( $eventgenerator->in_element($element) ) {}
497             Function: Test if we are in a particular element
498             This is different than 'in' because within can be tested
499             for a whole block.
500             Returns : boolean
501             Args : string element name
502              
503              
504             =cut
505              
506             sub in_element{
507 9437     9437 1 7026 my ($self,$name) = @_;
508 9437 100       11251 return 0 if ! defined $self->{'_elements'}->[0];
509 9436         30983 return ( $self->{'_elements'}->[0] eq $name)
510             }
511              
512              
513             =head2 start_document
514              
515             Title : start_document
516             Usage : $eventgenerator->start_document
517             Function: Handles a start document event
518             Returns : none
519             Args : none
520              
521              
522             =cut
523              
524             sub start_document{
525 0     0 1 0 my ($self) = @_;
526 0         0 $self->{'_lasttype'} = '';
527 0         0 $self->{'_values'} = {};
528 0         0 $self->{'_result'}= undef;
529 0         0 $self->{'_mode'} = '';
530 0         0 $self->{'_elements'} = [];
531             }
532              
533              
534             =head2 end_document
535              
536             Title : end_document
537             Usage : $eventgenerator->end_document
538             Function: Handles an end document event
539             Returns : Bio::Search::Result::ResultI object
540             Args : none
541              
542              
543             =cut
544              
545             sub end_document{
546 11     11 1 19 my ($self,@args) = @_;
547 11         92 return $self->{'_result'};
548             }
549              
550             =head2 result_count
551              
552             Title : result_count
553             Usage : my $count = $searchio->result_count
554             Function: Returns the number of results we have processed
555             Returns : integer
556             Args : none
557              
558              
559             =cut
560              
561             sub result_count {
562 0     0 1 0 my $self = shift;
563 0         0 return $self->{'_result_count'};
564             }
565              
566 0     0 0 0 sub report_count { shift->result_count }
567              
568              
569             =head2 program_name
570              
571             Title : program_name
572             Usage : $obj->program_name($newval)
573             Function: Get/Set the program name
574             Returns : value of program_name (a scalar)
575             Args : on set, new value (a scalar or undef, optional)
576              
577              
578             =cut
579              
580             sub program_name{
581 21     21 1 38 my $self = shift;
582              
583 21 100       55 $self->{'program_name'} = shift if @_;
584 21   33     95 return $self->{'program_name'} || $DefaultProgramName;
585             }
586              
587              
588             =head2 _will_handle
589              
590             Title : _will_handle
591             Usage : Private method. For internal use only.
592             if( $self->_will_handle($type) ) { ... }
593             Function: Provides an optimized way to check whether or not an element of a
594             given type is to be handled.
595             Returns : Reference to EventHandler object if the element type is to be handled.
596             undef if the element type is not to be handled.
597             Args : string containing type of element.
598              
599             Optimizations:
600              
601             =over 2
602              
603             =item 1
604              
605             Using the cached pointer to the EventHandler to minimize repeated
606             lookups.
607              
608             =item 2
609              
610             Caching the will_handle status for each type that is encountered so
611             that it only need be checked by calling
612             handler-Ewill_handle($type) once.
613              
614             =back
615              
616             This does not lead to a major savings by itself (only 5-10%). In
617             combination with other optimizations, or for large parse jobs, the
618             savings good be significant.
619              
620             To test against the unoptimized version, remove the parentheses from
621             around the third term in the ternary " ? : " operator and add two
622             calls to $self-E_eventHandler().
623              
624             =cut
625              
626             sub _will_handle {
627 2688     2688   2148 my ($self,$type) = @_;
628 2688         2347 my $handler = $self->{'_handler'};
629             my $will_handle = defined($self->{'_will_handle_cache'}->{$type})
630             ? $self->{'_will_handle_cache'}->{$type}
631 2688 100       4110 : ($self->{'_will_handle_cache'}->{$type} =
632             $handler->will_handle($type));
633              
634 2688 50       5005 return $will_handle ? $handler : undef;
635             }
636              
637             1;