File Coverage

Bio/Search/HSP/GenericHSP.pm
Criterion Covered Total %
statement 508 564 90.0
branch 274 354 77.4
condition 127 178 71.3
subroutine 50 50 100.0
pod 35 35 100.0
total 994 1181 84.1


line stmt bran cond sub pod time code
1             #
2             # BioPerl module for Bio::Search::HSP::GenericHSP
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Jason Stajich
7             #
8             # Copyright Jason Stajich
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
17              
18             =head1 SYNOPSIS
19              
20             my $hsp = Bio::Search::HSP::GenericHSP->new( -algorithm => 'blastp',
21             -evalue => '1e-30',
22             );
23              
24             $r_type = $hsp->algorithm;
25              
26             $pvalue = $hsp->p();
27              
28             $evalue = $hsp->evalue();
29              
30             $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
31              
32             $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
33              
34             $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
35              
36             $qseq = $hsp->query_string;
37              
38             $hseq = $hsp->hit_string;
39              
40             $homo_string = $hsp->homology_string;
41              
42             $len = $hsp->length( ['query'|'hit'|'total'] );
43              
44             $len = $hsp->length( ['query'|'hit'|'total'] );
45              
46             $rank = $hsp->rank;
47              
48             # TODO: Describe how to configure a SearchIO stream so that it generates
49             # GenericHSP objects.
50              
51             =head1 DESCRIPTION
52              
53             This implementation is "Generic", meaning it is is suitable for
54             holding information about High Scoring pairs from most Search reports
55             such as BLAST and FastA. Specialized objects can be derived from
56             this.
57              
58             Unless you're writing a parser, you won't ever need to create a
59             GenericHSP or any other HSPI-implementing object. If you use
60             the SearchIO system, HSPI objects are created automatically from
61             a SearchIO stream which returns Bio::Search::Result::ResultI objects
62             and you get the HSPI objects via the ResultI API.
63              
64             For documentation on what you can do with GenericHSP (and other HSPI
65             objects), please see the API documentation in
66             L.
67              
68             =head1 FEEDBACK
69              
70             =head2 Mailing Lists
71              
72             User feedback is an integral part of the evolution of this and other
73             Bioperl modules. Send your comments and suggestions preferably to
74             the Bioperl mailing list. Your participation is much appreciated.
75              
76             bioperl-l@bioperl.org - General discussion
77             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
78              
79             =head2 Support
80              
81             Please direct usage questions or support issues to the mailing list:
82              
83             I
84              
85             rather than to the module maintainer directly. Many experienced and
86             reponsive experts will be able look at the problem and quickly
87             address it. Please include a thorough description of the problem
88             with code and data examples if at all possible.
89              
90             =head2 Reporting Bugs
91              
92             Report bugs to the Bioperl bug tracking system to help us keep track
93             of the bugs and their resolution. Bug reports can be submitted via the
94             web:
95              
96             https://github.com/bioperl/bioperl-live/issues
97              
98             =head1 AUTHOR - Jason Stajich and Steve Chervitz
99              
100             Email jason-at-bioperl.org
101             Email sac-at-bioperl.org
102              
103             =head1 CONTRIBUTORS
104              
105             Sendu Bala, bix@sendu.me.uk
106              
107             =head1 APPENDIX
108              
109             The rest of the documentation details each of the object methods.
110             Internal methods are usually preceded with a _
111              
112             =cut
113              
114             # Let the code begin...
115              
116             package Bio::Search::HSP::GenericHSP;
117 27     27   168 use strict;
  27         48  
  27         768  
118              
119 27     27   126 use Bio::Root::Root;
  27         48  
  27         632  
120 27     27   9189 use Bio::SeqFeature::Similarity;
  27         70  
  27         937  
121              
122 27     27   196 use base qw(Bio::Search::HSP::HSPI);
  27         55  
  27         12341  
123              
124             =head2 new
125              
126             Title : new
127             Usage : my $obj = Bio::Search::HSP::GenericHSP->new();
128             Function: Builds a new Bio::Search::HSP::GenericHSP object
129             Returns : Bio::Search::HSP::GenericHSP
130             Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
131             -evalue => evalue
132             -pvalue => pvalue
133             -bits => bit value for HSP
134             -score => score value for HSP (typically z-score but depends on
135             analysis)
136             -hsp_length=> Length of the HSP (including gaps)
137             -identical => # of residues that that matched identically
138             -percent_identity => (optional) percent identity
139             -conserved => # of residues that matched conservatively
140             (only protein comparisons;
141             conserved == identical in nucleotide comparisons)
142             -hsp_gaps => # of gaps in the HSP
143             -query_gaps => # of gaps in the query in the alignment
144             -hit_gaps => # of gaps in the subject in the alignment
145             -query_name => HSP Query sequence name (if available)
146             -query_start => HSP Query start (in original query sequence coords)
147             -query_end => HSP Query end (in original query sequence coords)
148             -query_length=> total length of the query sequence
149             -query_seq => query sequence portion of the HSP
150             -query_desc => textual description of the query
151             -hit_name => HSP Hit sequence name (if available)
152             -hit_start => HSP Hit start (in original hit sequence coords)
153             -hit_end => HSP Hit end (in original hit sequence coords)
154             -hit_length => total length of the hit sequence
155             -hit_seq => hit sequence portion of the HSP
156             -hit_desc => textual description of the hit
157             -homology_seq=> homology sequence for the HSP
158             -hit_frame => hit frame (only if hit is translated protein)
159             -query_frame => query frame (only if query is translated protein)
160             -rank => HSP rank
161             -links => HSP links information (WU-BLAST only)
162             -hsp_group => HSP Group informat (WU-BLAST only)
163             -gap_symbol => symbol representing a gap (default = '-')
164             -hit_features=> string of features found in or near HSP hit
165             region (reported in some BLAST text output,
166             v. 2.2.13 and up)
167             -stranded => If the algorithm isn't known (i.e. defaults to
168             'generic'), setting this will indicate start/end
169             coordinates are to be used to determine the strand
170             for 'query', 'subject', 'hit', 'both', or 'none'
171             (default = 'none')
172              
173             =cut
174              
175             sub new {
176 1331     1331 1 10612 my($class,%args) = @_;
177              
178             # don't pass anything to SUPER; complex hierarchy results in lots of work
179             # for nothing
180            
181 1331         5254 my $self = $class->SUPER::new();
182              
183             # for speed, don't use _rearrange and just store all input data directly
184             # with no method calls and no work done. work can be carried
185             # out just-in-time later if desired
186 1331         4797 while (my ($arg, $value) = each %args) {
187 26240         30904 $arg =~ tr/a-z\055/A-Z/d;
188 26240         65792 $self->{$arg} = $value;
189             }
190 1331         2642 my $bits = $self->{BITS};
191              
192 1331 100       5183 defined $self->{VERBOSE} && $self->verbose($self->{VERBOSE});
193 1331 50       2649 if (exists $self->{GAP_SYMBOL}) {
194             # not checking anything else but the length (must be 1 as only one gap
195             # symbol allowed currently); can add in support for symbol checks or
196             # multiple symbols later if needed
197             $self->throw("Gap symbol must be of length 1") if
198 0 0       0 CORE::length($self->{GAP_SYMBOL}) != 1;
199             } else {
200             # dafault
201 1331         3228 $self->{GAP_SYMBOL} = '-';
202             }
203 1331   100     3323 $self->{ALGORITHM} ||= 'GENERIC';
204 1331   100     4168 $self->{STRANDED} ||= 'NONE';
205            
206 1331 50 33     5227 if (! defined $self->{QUERY_LENGTH} || ! defined $self->{HIT_LENGTH}) {
207 0         0 $self->throw("Must define hit and query length");
208             }
209              
210 1331         2298 $self->{'_sequenceschanged'} = 1;
211            
212 1331         2031 $self->{_finished_new} = 1;
213 1331         8140 return $self;
214             }
215              
216             sub _logical_length {
217 313     313   459 my ($self, $type) = @_;
218 313 100 66     852 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
219 105         241 $self->_calculate_seq_offsets();
220             }
221 313 100       1150 my $key = $type =~ /sbjct|hit|tot/i ? 'sbjct' : 'query';
222            
223 313         673 my $offset = $self->{"_${key}_offset"};
224 313         522 return $self->length($type) / $offset ;
225             }
226              
227             =head2 L methods
228              
229             Implementation of L methods follow
230              
231             =head2 algorithm
232              
233             Title : algorithm
234             Usage : my $r_type = $hsp->algorithm
235             Function: Obtain the name of the algorithm used to obtain the HSP
236             Returns : string (e.g., BLASTP)
237             Args : [optional] scalar string to set value
238              
239             =cut
240              
241             sub algorithm{
242 7397     7397 1 14811 my ($self,$value) = @_;
243 7397         10130 my $previous = $self->{'ALGORITHM'};
244 7397 50 33     18515 if( defined $value || ! defined $previous ) {
245 0 0       0 $value = $previous = '' unless defined $value;
246 0         0 $self->{'ALGORITHM'} = $value;
247             }
248              
249 7397         13394 return $previous;
250             }
251              
252             =head2 pvalue
253              
254             Title : pvalue
255             Usage : my $pvalue = $hsp->pvalue();
256             Function: Returns the P-value for this HSP or undef
257             Returns : float or exponential (2e-10)
258             P-value is not defined with NCBI Blast2 reports.
259             Args : [optional] numeric to set value
260              
261             =cut
262              
263             sub pvalue {
264 63     63 1 109 my ($self,$value) = @_;
265 63         95 my $previous = $self->{'PVALUE'};
266 63 50       129 if( defined $value ) {
267 0         0 $self->{'PVALUE'} = $value;
268             }
269 63         173 return $previous;
270             }
271              
272             =head2 evalue
273              
274             Title : evalue
275             Usage : my $evalue = $hsp->evalue();
276             Function: Returns the e-value for this HSP
277             Returns : float or exponential (2e-10)
278             Args : [optional] numeric to set value
279              
280             =cut
281              
282             sub evalue {
283 170     170 1 338 my ($self,$value) = @_;
284 170         295 my $previous = $self->{'EVALUE'};
285 170 50       371 if( defined $value ) {
286 0         0 $self->{'EVALUE'} = $value;
287             }
288 170         1101 return $previous;
289             }
290              
291             =head2 frac_identical
292              
293             Title : frac_identical
294             Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
295             Function: Returns the fraction of identitical positions for this HSP
296             Returns : Float in range 0.0 -> 1.0
297             Args : arg 1: 'query' = num identical / length of query seq (without gaps)
298             'hit' = num identical / length of hit seq (without gaps)
299             synonyms: 'sbjct', 'subject'
300             'total' = num identical / length of alignment (with gaps)
301             synonyms: 'hsp'
302             default = 'total'
303             arg 2: [optional] frac identical value to set for the type requested
304             Note : for translated sequences, this adjusts the length accordingly
305              
306             =cut
307              
308             sub frac_identical {
309 602     602 1 1010 my ($self, $type,$value) = @_;
310              
311 602 100       1007 unless ($self->{_did_prefrac}) {
312 105         251 $self->_pre_frac;
313             }
314              
315 602 50       1175 $type = lc $type if defined $type;
316 602 50 33     1724 $type = 'hit' if( defined $type &&
317             $type =~ /subject|sbjct/);
318 602 100 66     3328 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
319             $type !~ /query|hit|subject|sbjct|total/);
320 602         1009 my $previous = $self->{'_frac_identical'}->{$type};
321 602 100 66     1313 if( defined $value || ! defined $previous ) {
322 313 50       457 $value = $previous = '' unless defined $value;
323 313 100 100     726 if( $type eq 'hit' || $type eq 'query' ) {
324 208         459 $self->$type()->frac_identical( $value);
325             }
326 313         553 $self->{'_frac_identical'}->{$type} = $value;
327             }
328 602         2069 return $previous;
329              
330             }
331              
332             =head2 frac_conserved
333              
334             Title : frac_conserved
335             Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
336             Function : Returns the fraction of conserved positions for this HSP.
337             This is the fraction of symbols in the alignment with a
338             positive score.
339             Returns : Float in range 0.0 -> 1.0
340             Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
341             'hit' = num conserved / length of hit seq (without gaps)
342             synonyms: 'sbjct', 'subject'
343             'total' = num conserved / length of alignment (with gaps)
344             synonyms: 'hsp'
345             default = 'total'
346             arg 2: [optional] frac conserved value to set for the type requested
347              
348             =cut
349              
350             sub frac_conserved {
351 440     440 1 683 my ($self, $type,$value) = @_;
352              
353 440 50       723 unless ($self->{_did_prefrac}) {
354 0         0 $self->_pre_frac;
355             }
356              
357 440 100       824 $type = lc $type if defined $type;
358 440 50 66     1280 $type = 'hit' if( defined $type && $type =~ /subject|sbjct/);
359 440 100 100     2435 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
360             $type !~ /query|hit|subject|sbjct|total/);
361 440         703 my $previous = $self->{'_frac_conserved'}->{$type};
362 440 100 66     865 if( defined $value || ! defined $previous ) {
363 313 50       425 $value = $previous = '' unless defined $value;
364 313         422 $self->{'_frac_conserved'}->{$type} = $value;
365             }
366 440         1069 return $previous;
367             }
368              
369             =head2 gaps
370              
371             Title : gaps
372             Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
373             Function : Get the number of gap characters in the query, hit, or total alignment.
374             Returns : Integer, number of gaps or 0 if none
375             Args : arg 1: 'query' = num gap characters in query seq
376             'hit' = num gap characters in hit seq; synonyms: 'sbjct', 'subject'
377             'total' = num gap characters in whole alignment; synonyms: 'hsp'
378             default = 'total'
379             arg 2: [optional] integer gap value to set for the type requested
380              
381             =cut
382              
383             sub gaps {
384 3125     3125 1 4318 my ($self, $type, $value) = @_;
385              
386 3125 100       4870 unless ($self->{_did_pregaps}) {
387 378         859 $self->_pre_gaps;
388             }
389              
390 3125 100       5027 $type = lc $type if defined $type;
391 3125 50 66     12559 $type = 'total' if( ! defined $type || $type eq 'hsp' ||
      66        
392             $type !~ /query|hit|subject|sbjct|total/);
393 3125 100       5177 $type = 'hit' if $type =~ /sbjct|subject/;
394 3125         4424 my $previous = $self->{'_gaps'}->{$type};
395 3125 100 100     6438 if( defined $value || ! defined $previous ) {
396 1110 100       1554 $value = $previous = '' unless defined $value;
397 1110         1830 $self->{'_gaps'}->{$type} = $value;
398             }
399 3125   100     8042 return $previous || 0;
400             }
401              
402             =head2 query_string
403              
404             Title : query_string
405             Usage : my $qseq = $hsp->query_string;
406             Function: Retrieves the query sequence of this HSP as a string
407             Returns : string
408             Args : [optional] string to set for query sequence
409              
410              
411             =cut
412              
413             sub query_string{
414 610     610 1 1460 my ($self,$value) = @_;
415 610         931 my $previous = $self->{QUERY_SEQ};
416 610 100 66     1744 if( defined $value || ! defined $previous ) {
417 3 50       11 $value = $previous = '' unless defined $value;
418 3         7 $self->{QUERY_SEQ} = $value;
419             # do some housekeeping so we know when to
420             # re-run _calculate_seq_positions
421 3         5 $self->{'_sequenceschanged'} = 1;
422             }
423 610         1457 return $previous;
424             }
425              
426             =head2 hit_string
427              
428             Title : hit_string
429             Usage : my $hseq = $hsp->hit_string;
430             Function: Retrieves the hit sequence of this HSP as a string
431             Returns : string
432             Args : [optional] string to set for hit sequence
433              
434              
435             =cut
436              
437             sub hit_string{
438 618     618 1 3760 my ($self,$value) = @_;
439 618         925 my $previous = $self->{HIT_SEQ};
440 618 100 66     2230 if( defined $value || ! defined $previous ) {
441 1 50       4 $value = $previous = '' unless defined $value;
442 1         2 $self->{HIT_SEQ} = $value;
443             # do some housekeeping so we know when to
444             # re-run _calculate_seq_positions
445 1         3 $self->{'_sequenceschanged'} = 1;
446             }
447 618         2058 return $previous;
448             }
449              
450             =head2 homology_string
451              
452             Title : homology_string
453             Usage : my $homo_string = $hsp->homology_string;
454             Function: Retrieves the homology sequence for this HSP as a string.
455             : The homology sequence is the string of symbols in between the
456             : query and hit sequences in the alignment indicating the degree
457             : of conservation (e.g., identical, similar, not similar).
458             Returns : string
459             Args : [optional] string to set for homology sequence
460              
461             =cut
462              
463             sub homology_string{
464 6905     6905 1 9642 my ($self,$value) = @_;
465 6905         8565 my $previous = $self->{HOMOLOGY_SEQ};
466 6905 100 66     16481 if( defined $value || ! defined $previous ) {
467 3 50       15 $value = $previous = '' unless defined $value;
468 3         9 $self->{HOMOLOGY_SEQ} = $value;
469             # do some housekeeping so we know when to
470             # re-run _calculate_seq_positions
471 3         7 $self->{'_sequenceschanged'} = 1;
472             }
473 6905         19860 return $previous;
474             }
475              
476             =head2 consensus_string
477              
478             Title : consensus_string
479             Usage : my $cs_string = $hsp->consensus_string;
480             Function: Retrieves the consensus structure line for this HSP as a string (HMMER).
481             : If the model had any consensus structure or reference line annotation
482             : that it inherited from a multiple alignment (#=GC SS cons,
483             : #=GC RF annotation in Stockholm files), that information is shown
484             : as CS or RF annotation line.
485             Returns : string
486             Args : [optional] string to set for consensus structure
487              
488             =cut
489              
490             sub consensus_string {
491 8     8 1 26 my ($self,$value) = @_;
492 8         22 my $previous = $self->{CS_SEQ};
493 8 50 33     47 if( defined $value || ! defined $previous ) {
494 0 0       0 $value = $previous = '' unless defined $value;
495 0         0 $self->{CS_SEQ} = $value;
496             # do some housekeeping so we know when to
497             # re-run _calculate_seq_positions
498 0         0 $self->{'_sequenceschanged'} = 1;
499             }
500 8         38 return $previous;
501             }
502              
503             =head2 posterior_string
504              
505             Title : posterior_string
506             Usage : my $pp_string = $hsp->posterior_string;
507             Function: Retrieves the posterior probability line for this HSP as a string (HMMer3).
508             : The posterior probability is the string of symbols at the bottom
509             : of the alignment indicating the expected accuracy of each aligned residue.
510             : A 0 means 0-5%, 1 means 5-15%, and so on; 9 means 85-95%,
511             : and a * means 95-100% posterior probability.
512             Returns : string
513             Args : [optional] string to set for posterior probability
514              
515             =cut
516              
517             sub posterior_string {
518 9     9 1 29 my ($self,$value) = @_;
519 9         22 my $previous = $self->{PP_SEQ};
520 9 100 66     63 if( defined $value || ! defined $previous ) {
521 2 50       5 $value = $previous = '' unless defined $value;
522 2         5 $self->{PP_SEQ} = $value;
523             # do some housekeeping so we know when to
524             # re-run _calculate_seq_positions
525 2         2 $self->{'_sequenceschanged'} = 1;
526             }
527 9         46 return $previous;
528             }
529              
530             =head2 length
531              
532             Title : length
533             Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
534             Function : Returns the length of the query or hit in the alignment
535             (without gaps)
536             or the aggregate length of the HSP (including gaps;
537             this may be greater than either hit or query )
538             Returns : integer
539             Args : arg 1: 'query' = length of query seq (without gaps)
540             'hit' = length of hit seq (without gaps) (synonyms: sbjct, subject)
541             'total' = length of alignment (with gaps)
542             default = 'total'
543             arg 2: [optional] integer length value to set for specific type
544              
545             =cut
546              
547             sub length {
548              
549 3839     3839 1 4533 my $self = shift;
550 3839         4116 my $type = shift;
551              
552 3839 100       5830 $type = 'total' unless defined $type;
553 3839         5451 $type = lc $type;
554              
555 3839 100       11077 if( $type =~ /^q/i ) {
    100          
556 1615         3033 return $self->query()->length(shift);
557             } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
558 989         1901 return $self->hit()->length(shift);
559             } else {
560 1235         1452 my $v = shift;
561 1235 100       1919 if( defined $v ) {
562 105         153 $self->{HSP_LENGTH} = $v;
563             }
564 1235         3377 return $self->{HSP_LENGTH};
565             }
566 0         0 return 0; # should never get here
567             }
568              
569             =head2 hsp_length
570              
571             Title : hsp_length
572             Usage : my $len = $hsp->hsp_length()
573             Function: shortcut length('hsp')
574             Returns : floating point between 0 and 100
575             Args : none
576              
577             =cut
578              
579 16     16 1 94 sub hsp_length { return shift->length('hsp', shift); }
580              
581             =head2 percent_identity
582              
583             Title : percent_identity
584             Usage : my $percentid = $hsp->percent_identity()
585             Function: Returns the calculated percent identity for an HSP
586             Returns : floating point between 0 and 100
587             Args : none
588              
589              
590             =cut
591              
592             sub percent_identity {
593 84     84 1 440 my $self = shift;
594              
595 84 100       235 unless ($self->{_did_prepi}) {
596 42         144 $self->_pre_pi;
597             }
598              
599 84         263 return $self->SUPER::percent_identity(@_);
600             }
601              
602             =head2 frame
603              
604             Title : frame
605             Usage : my ($qframe, $hframe) = $hsp->frame('list',$queryframe,$subjectframe)
606             Function: Set the Frame for both query and subject and insure that
607             they agree.
608             This overrides the frame() method implementation in
609             FeaturePair.
610             Returns : array of query and subject frame if return type wants an array
611             or query frame if defined or subject frame if not defined
612             Args : 'hit' or 'subject' or 'sbjct' to retrieve the frame of the subject (default)
613             'query' to retrieve the query frame
614             'list' or 'array' to retrieve both query and hit frames together
615             Note : Frames are stored in the GFF way (0-2) not 1-3
616             as they are in BLAST (negative frames are deduced by checking
617             the strand of the query or hit)
618              
619             =cut
620              
621             # Note: changed 4/19/08 - bug 2485
622             #
623             # frame() is supposed to be a getter/setter (as is implied by the Function desc
624             # above; this is also consistent with Bio::SeqFeature::SimilarityPair). Also,
625             # the API is not consistent with the other HSP/SimilarityPair methods such as
626             # strand(), start(), end(), etc.
627             #
628             # In order to make this consistent with other methods all work is now done
629             # when the features are instantiated and not delayed. We compromise by
630             # defaulting back 'to hit' w/o passed args. Setting is now allowed.
631              
632             sub frame {
633 610     610 1 2327 my $self = shift;
634 610         689 my $val = shift;
635 610 50       1062 if (!defined $val) {
636             # unfortunately, w/o args we need to warn about API changes
637 0         0 $self->warn("API for frame() has changed.\n".
638             "Please refer to documentation for Bio::Search::HSP::GenericHSP;\n".
639             "returning query frame");
640 0         0 $val = 'query';
641             }
642 610         1349 $val =~ s/^\s+//;
643              
644 610 100       1593 if( $val =~ /^q/i ) {
    50          
    0          
    0          
645 284         464 return $self->query->frame(@_);
646             } elsif( $val =~ /^hi|^s/i ) {
647 326         594 return $self->hit->frame(@_);
648             } elsif ( $val =~ /^list|array/i ) {
649 0         0 return ($self->query->frame($_[0]),
650             $self->hit->frame($_[1]) );
651             } elsif ( $val =~ /^\d+$/) {
652             # old API i.e. frame($query_frame, $hit_frame). This catches all but one
653             # case, where no arg is passed (so the hit is wanted).
654 0         0 $self->warn("API for frame() has changed.\n".
655             "Please refer to documentation for Bio::Search::HSP::GenericHSP");
656             wantarray ?
657 0 0       0 return ($self->query->frame($val),
658             $self->hit->frame(@_) ) :
659             return $self->hit->frame($val,@_);
660             } else {
661 0         0 $self->warn("unrecognized component '$val' requested\n");
662             }
663 0         0 return 0;
664             }
665              
666             =head2 get_aln
667              
668             Title : get_aln
669             Usage : my $aln = $hsp->gel_aln
670             Function: Returns a L object representing the HSP alignment
671             Returns : L
672             Args : none
673              
674             =cut
675              
676             sub get_aln {
677 19     19 1 46 my ($self) = @_;
678 19         165 require Bio::LocatableSeq;
679 19         4073 require Bio::SimpleAlign;
680              
681 19         116 my $aln = Bio::SimpleAlign->new();
682 19         84 my $hs = $self->hit_string();
683 19         72 my $qs = $self->query_string();
684             # FASTA specific stuff moved to the FastaHSP object
685 19         42 my $seqonly = $qs;
686 19         152 $seqonly =~ s/[\-\s]//g;
687 19         67 my ($q_nm,$s_nm) = ($self->query->seq_id(),
688             $self->hit->seq_id());
689             # Should we silently change the name of the query or hit if it isn't
690             # defined? May need revisiting... cjfields 2008-12-3 (commented out below)
691            
692             #unless( defined $q_nm && CORE::length ($q_nm) ) {
693             # $q_nm = 'query';
694             #}
695             #unless( defined $s_nm && CORE::length ($s_nm) ) {
696             # $s_nm = 'hit';
697             #}
698            
699             # mapping: 1 residues for every x coordinate positions
700             my $query = Bio::LocatableSeq->new(
701             -seq => $qs,
702             -id => $q_nm,
703             -start => $self->query->start,
704             -end => $self->query->end,
705             -strand => $self->query->strand,
706             -force_nse => $q_nm ? 0 : 1,
707 19 100       73 -mapping => [ 1, $self->{_query_mapping} ]
708             );
709 19         67 $seqonly = $hs;
710 19         195 $seqonly =~ s/[\-\s]//g;
711             my $hit = Bio::LocatableSeq->new(
712             -seq => $hs,
713             -id => $s_nm,
714             -start => $self->hit->start,
715             -end => $self->hit->end,
716             -strand => $self->hit->strand,
717             -force_nse => $s_nm ? 0 : 1,
718 19 50       79 -mapping => [ 1, $self->{_hit_mapping} ]
719             );
720 19         134 $aln->add_seq($query);
721 19         65 $aln->add_seq($hit);
722 19         110 return $aln;
723             }
724              
725             =head2 num_conserved
726              
727             Title : num_conserved
728             Usage : $obj->num_conserved($newval)
729             Function: returns the number of conserved residues in the alignment
730             Returns : integer
731             Args : integer (optional)
732              
733              
734             =cut
735              
736             sub num_conserved{
737 1914     1914 1 2496 my ($self,$value) = @_;
738              
739 1914 100       3164 unless ($self->{_did_presimilar}) {
740 1         4 $self->_pre_similar_stats;
741             }
742              
743 1914 50       2698 if (defined $value) {
744 0         0 $self->{CONSERVED} = $value;
745             }
746 1914         3634 return $self->{CONSERVED};
747             }
748              
749             =head2 num_identical
750              
751             Title : num_identical
752             Usage : $obj->num_identical($newval)
753             Function: returns the number of identical residues in the alignment
754             Returns : integer
755             Args : integer (optional)
756              
757              
758             =cut
759              
760             sub num_identical{
761 1923     1923 1 2800 my ($self,$value) = @_;
762              
763 1923 100       3565 unless ($self->{_did_presimilar}) {
764 437         887 $self->_pre_similar_stats;
765             }
766              
767 1923 50       2837 if( defined $value) {
768 0         0 $self->{IDENTICAL} = $value;
769             }
770 1923         4024 return $self->{IDENTICAL};
771             }
772              
773             =head2 rank
774              
775             Usage : $hsp->rank( [string] );
776             Purpose : Get the rank of the HSP within a given Blast hit.
777             Example : $rank = $hsp->rank;
778             Returns : Integer (1..n) corresponding to the order in which the HSP
779             appears in the BLAST report.
780              
781             =cut
782              
783             sub rank {
784 4514     4514 1 5418 my ($self,$value) = @_;
785 4514 50       5538 if( defined $value) {
786 0         0 $self->{RANK} = $value;
787             }
788 4514         10365 return $self->{RANK};
789             }
790              
791             =head2 seq_inds
792              
793             Title : seq_inds
794             Purpose : Get a list of residue positions (indices) for all identical
795             : or conserved residues in the query or sbjct sequence.
796             Example : @s_ind = $hsp->seq_inds('query', 'identical');
797             : @h_ind = $hsp->seq_inds('hit', 'conserved');
798             : @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
799             : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
800             Returns : List of integers
801             : May include ranges if collapse is true.
802             Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
803             : ('sbjct' is synonymous with 'hit')
804             : class = 'identical' - identical positions
805             : 'conserved' - conserved positions
806             : 'nomatch' - mismatched residue or gap positions
807             : 'mismatch' - mismatched residue positions (no gaps)
808             : 'gap' - gap positions only
809             : 'frameshift'- frameshift positions only
810             : 'conserved-not-identical' - conserved positions w/o
811             : identical residues
812             : The name can be shortened to 'id' or 'cons' unless
813             : the name is . The default value is
814             : 'identical'
815             :
816             : collapse = boolean, if true, consecutive positions are merged
817             : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
818             : collapses to "1-5 7 9-11". This is useful for
819             : consolidating long lists. Default = no collapse.
820             :
821             Throws : n/a.
822             Comments : For HSPs partially or completely derived from translated sequences
823             : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
824             : cannot easily be attributed to a single position (i.e. the
825             : positional data is ambiguous). In these cases all three codon
826             : positions are reported. Under these conditions you can check
827             : ambiguous_seq_inds() to determine whether the query, subject,
828             : or both are ambiguous.
829             :
830             See Also : L,
831             L
832              
833             =cut
834              
835             sub seq_inds{
836 194     194 1 464 my ($self, $seqType, $class, $collapse) = @_;
837              
838             # prepare the internal structures - this is cached so
839             # if the strings have not changed we're okay
840 194         496 $self->_calculate_seq_positions();
841              
842 194   50     375 $seqType ||= 'query';
843 194   50     310 $class ||= 'identical';
844 194   100     503 $collapse ||= 0;
845 194 100       382 $seqType = 'sbjct' if $seqType eq 'hit';
846 194         436 my $t = lc(substr($seqType,0,1));
847 194 100 33     526 if( $t eq 'q' ) {
    50          
848 100         150 $seqType = 'query';
849             } elsif ( $t eq 's' || $t eq 'h' ) {
850 94         145 $seqType = 'sbjct';
851             } else {
852 0         0 $self->warn("unknown seqtype $seqType using 'query'");
853 0         0 $seqType = 'query';
854             }
855            
856 194         266 $t = lc(substr($class,0,1));
857              
858 194 100       705 if( $t eq 'c' ) {
    50          
    100          
    100          
    100          
    50          
859 8 50       24 if( $class =~ /conserved\-not\-identical/ ) {
860 0         0 $class = 'conserved';
861             } else {
862 8         17 $class = 'conservedall';
863             }
864             } elsif( $t eq 'i' ) {
865 0         0 $class = 'identical';
866             } elsif( $t eq 'n' ) {
867 18         37 $class = 'nomatch';
868             } elsif( $t eq 'm' ) {
869 10         40 $class = 'mismatch';
870             } elsif( $t eq 'g' ) {
871 150         169 $class = 'gap';
872             } elsif( $t eq 'f' ) {
873 8         11 $class = 'frameshift';
874             } else {
875 0         0 $self->warn("unknown sequence class $class using 'identical'");
876 0         0 $class = 'identical';
877             }
878              
879             ## Sensitive to member name changes.
880 194         326 $seqType = "_\L$seqType\E";
881 194         248 $class = "_\L$class\E";
882 194         241 my @ary;
883              
884 194 100       341 if( $class eq '_gap' ) {
    100          
885             # this means that we are remapping the gap length that is stored
886             # in the hash (for example $self->{'_gapRes_query'} )
887             # so we'll return an array which has the values of the position of the
888             # of the gap (the key in the hash) + the gap length (value in the
889             # hash for this key - 1.
890            
891             # changing this; since the index is the position prior to the insertion,
892             # repeat the position based on the number of gaps inserted
893 521         440 @ary = map { my @tmp;
894             # position holds number of gaps inserted
895 521         885 for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
896 1148         1283 push @tmp, $_ ;
897             }
898 521         772 @tmp}
899 150         180 sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
  1882         1677  
  150         947  
900             } elsif( $class eq '_conservedall' ) {
901 13476         11173 @ary = sort { $a <=> $b }
902 8         185 keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
903 8         11 keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
  8         404  
904             } else {
905 36         63 @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
  28308         23914  
  36         1136  
906             }
907 194 100       4152 require Bio::Search::BlastUtils if $collapse;
908              
909 194 100       652 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
910             }
911              
912             =head2 ambiguous_seq_inds
913              
914             Title : ambiguous_seq_inds
915             Purpose : returns a string denoting whether sequence indices for query,
916             : subject, or both are ambiguous
917             Returns : String; 'query', 'subject', 'query/subject', or empty string ''
918             Argument : none
919             Comments : For HSPs partially or completely derived from translated sequences
920             : (TBLASTN, BLASTX, TBLASTX, or similar), some positional data
921             : cannot easily be attributed to a single position (i.e. the
922             : positional data is ambiguous). In these cases all three codon
923             : positions are reported when using seq_inds(). Under these
924             : conditions you can check ambiguous_seq_inds() to determine whether
925             : the query, subject, or both are ambiguous.
926             See Also : L
927              
928             =cut
929              
930             sub ambiguous_seq_inds {
931 6     6 1 18 my $self = shift;
932 6         18 $self->_calculate_seq_positions();
933             my $type = ($self->{_query_offset} == 3 && $self->{_sbjct_offset} == 3) ?
934             'query/subject' :
935             ($self->{_query_offset} == 3) ? 'query' :
936 6 100 100     50 ($self->{_sbjct_offset} == 3) ? 'subject' : '';
    100          
    100          
937 6         26 return $type;
938             }
939              
940             =head2 Inherited from L
941              
942             These methods come from L
943              
944             =head2 query
945              
946             Title : query
947             Usage : my $query = $hsp->query
948             Function: Returns a SeqFeature representing the query in the HSP
949             Returns : L
950             Args : [optional] new value to set
951              
952             =cut
953              
954             sub query {
955 5590     5590 1 9326 my $self = shift;
956 5590 100       9786 unless ($self->{_created_qff}) {
957 1167         2196 $self->_query_seq_feature;
958             }
959 5590         11300 return $self->SUPER::query(@_);
960             }
961              
962             sub feature1 {
963 6939     6939 1 7808 my $self = shift;
964 6939 100 66     19500 if (! $self->{_finished_new} || $self->{_making_qff}) {
965 1331 50       3159 return $self->{_sim1} if $self->{_sim1};
966 1331         4713 $self->{_sim1} = Bio::SeqFeature::Similarity->new();
967 1331         3662 return $self->{_sim1};
968             }
969 5608 100       7940 unless ($self->{_created_qff}) {
970 9         39 $self->_query_seq_feature;
971             }
972 5608         9846 return $self->SUPER::feature1(@_);
973             }
974              
975             =head2 hit
976              
977             Title : hit
978             Usage : my $hit = $hsp->hit
979             Function: Returns a SeqFeature representing the hit in the HSP
980             Returns : L
981             Args : [optional] new value to set
982              
983             =cut
984              
985             sub hit {
986 4390     4390 1 12786 my $self = shift;
987 4390 100       7269 unless ($self->{_created_sff}) {
988 550         1227 $self->_subject_seq_feature;
989             }
990 4390         8852 return $self->SUPER::hit(@_);
991             }
992              
993             sub feature2 {
994 4408     4408 1 8022 my $self = shift;
995 4408 50 33     12313 if (! $self->{_finished_new} || $self->{_making_sff}) {
996 0 0       0 return $self->{_sim2} if $self->{_sim2};
997 0         0 $self->{_sim2} = Bio::SeqFeature::Similarity->new();
998 0         0 return $self->{_sim2};
999             }
1000 4408 100       6620 unless ($self->{_created_sff}) {
1001 12         43 $self->_subject_seq_feature;
1002             }
1003 4408         8003 return $self->SUPER::feature2(@_);
1004             }
1005              
1006             =head2 significance
1007              
1008             Title : significance
1009             Usage : $evalue = $obj->significance();
1010             $obj->significance($evalue);
1011             Function: Get/Set the significance value
1012             Returns : numeric
1013             Args : [optional] new value to set
1014              
1015             =cut
1016              
1017             # Override significance to return the e-value or, if this is
1018             # not defined (WU-BLAST), return the p-value.
1019             sub significance {
1020 19     19 1 56 my ($self, $val) = @_;
1021 19 50 33     79 if (!defined $self->{SIGNIFICANCE} || defined $val) {
1022 19 50       93 $self->{SIGNIFICANCE} = defined $val ? $val :
    100          
    50          
1023             defined $self->evalue ? $self->evalue :
1024             defined $self->pvalue ? $$self->pvalue :
1025             undef;
1026 19         54 $self->query->significance($self->{SIGNIFICANCE});
1027             }
1028 19         93 return $self->{SIGNIFICANCE};
1029             }
1030              
1031             =head2 strand
1032              
1033             Title : strand
1034             Usage : $hsp->strand('query')
1035             Function: Retrieves the strand for the HSP component requested
1036             Returns : +1 or -1
1037             Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject,
1038             'query' to retrieve the query strand (default)
1039              
1040             =cut
1041              
1042             sub strand {
1043 587     587 1 1044 my ($self,$type) = @_;
1044              
1045 587 100 100     4280 if( $type =~ /^q/i && defined $self->{'QUERY_STRAND'} ) {
    100 100        
1046 2         16 return $self->{'QUERY_STRAND'};
1047             } elsif( $type =~ /^(hit|subject|sbjct)/i && defined $self->{'HIT_STRAND'} ) {
1048 2         20 return $self->{'HIT_STRAND'};
1049             }
1050              
1051 583         1755 return $self->SUPER::strand($type)
1052             }
1053              
1054             =head2 score
1055              
1056             Title : score
1057             Usage : $score = $obj->score();
1058             $obj->score($value);
1059             Function: Get/Set the score value
1060             Returns : numeric
1061             Args : [optional] new value to set
1062              
1063             =head2 bits
1064              
1065             Title : bits
1066             Usage : $bits = $obj->bits();
1067             $obj->bits($value);
1068             Function: Get/Set the bits value
1069             Returns : numeric
1070             Args : [optional] new value to set
1071              
1072             =head1 Private methods
1073              
1074             =cut
1075              
1076             =head2 _calculate_seq_positions
1077              
1078             Title : _calculate_seq_positions
1079             Usage : $self->_calculate_seq_positions
1080             Function: Internal function
1081             Returns :
1082             Args :
1083              
1084              
1085             =cut
1086              
1087             sub _calculate_seq_positions {
1088 205     205   349 my ($self,@args) = @_;
1089 205 100       529 return unless ( $self->{'_sequenceschanged'} );
1090 73         104 $self->{'_sequenceschanged'} = 0;
1091 73         170 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
1092             $self->query_string(),
1093             $self->hit_string() );
1094 73         195 my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
1095 73   100     148 my $qdir = $self->query->strand || 1;
1096 73   100     176 my $sdir = $self->hit->strand || 1;
1097 73 100       237 my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
1098             : ($self->query->start, $self->query->end);
1099 73 100       259 my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
1100             : ($self->hit->start, $self->hit->end);
1101            
1102 73         193 my $prog = $self->algorithm;
1103            
1104 73 100       346 if( $prog =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
1105            
1106             # we infer the end of the regional sequence where the first and last
1107             # non spaces are in the homology string
1108             # then we use the HSP->length to tell us how far to read
1109             # to cut off the end of the sequence
1110            
1111 4         29 my ($start, $rest) = (0,0);
1112 4 50       100 if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
1113 4 100       25 ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
1114             }
1115            
1116 4         15 $seqString = substr($seqString, $start, $rest);
1117 4         10 $qseq = substr($qseq, $start, $rest);
1118 4         14 $sseq = substr($sseq, $start, $rest);
1119              
1120             # commented out 10/29/08
1121             # removing frameshift symbols doesn't take into account the following:
1122             # 1) does not remove the same point in the homology string (get
1123             # positional errors)
1124             # 2) adjustments to the overall position in the string due to the
1125             # frameshift must be taken into consideration (get balancing errors)
1126             #
1127             # Frameshifts will be handled directly in the main loop.
1128             # --chris
1129            
1130             # fasta reports some extra 'regional' sequence information
1131             # we need to clear out first
1132             # this seemed a bit insane to me at first, but it appears to
1133             # work --jason
1134            
1135             #$qseq =~ s![\\\/]!!g;
1136             #$sseq =~ s![\\\/]!!g;
1137             }
1138              
1139 73 100 66     315 if (!defined($self->{_sbjct_offset}) || !defined($self->{_query_offset})) {
1140 2         11 $self->_calculate_seq_offsets();
1141             }
1142            
1143 73         149 my ($qfs, $sfs) = (0,0);
1144             CHAR_LOOP:
1145 73         240 for my $pos (0..CORE::length($seqString)-1) {
1146 30602         42446 my @qrange = (0 - $qfs)..($self->{_query_offset} - 1);
1147 30602         33239 my @srange = (0 - $sfs)..($self->{_sbjct_offset} - 1);
1148             # $self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
1149 30602         30831 ($qfs, $sfs) = (0,0);
1150 30602 50 100     115853 my ($mchar, $qchar, $schar) = (
    50          
1151             unpack("x$pos A1",$seqString) || ' ',
1152             $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
1153             $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
1154             );
1155 30602         34000 my $matchtype = '';
1156 30602         29410 my ($qgap, $sgap) = (0,0);
1157 30602 100 100     91214 if( $mchar eq '+' || $mchar eq '.') { # conserved
    100 100        
    50          
1158 757         2521 $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1159 757         3485 $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1160 757         841 $matchtype = 'conserved';
1161             } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
1162 25371         60121 $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1163 25371         48150 $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1164 25371         25370 $matchtype = 'identical';
1165             } elsif( $mchar eq ' ' ) { # possible mismatch/nomatch/frameshift
1166 4474 100       6045 $qfs = $qchar eq '/' ? -1 : # base inserted to match frame
    50          
1167             $qchar eq '\\' ? 1 : # base deleted to match frame
1168             0;
1169 4474 50       5733 $sfs = $schar eq '/' ? -1 :
    50          
1170             $schar eq '\\' ? 1 :
1171             0;
1172 4474 100       8572 if ($qfs) {
    50          
    100          
    100          
1173             # Frameshifts are tricky.
1174            
1175             # '/' indicates that the next residue is shifted back one
1176             # (-1) frame position and is a deletion of one base (so to
1177             # correctly align, a base is inserted). That residue should only
1178             # occupy two sequence positions instead of three.
1179            
1180             # '\' indicates that the next residue is shifted forward
1181             # one (+1) frame position and is an insertion of one base (so to
1182             # correctly align, a base is removed). That residue should
1183             # occupy four sequence positions instead of three.
1184            
1185             # Note that gaps are not counted across from frameshifts.
1186             # Frameshift indices are a range of positions starting in the
1187             # previous sequence position in which the frameshift occurs;
1188             # they are ambiguous by nature.
1189 2         14 $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($_ * $qdir * $qfs) } = $qfs for @qrange;
1190 2         5 $matchtype = "$qfs frameshift-query";
1191 2         3 $sgap = $qgap = 1;
1192             }
1193             elsif ($sfs) {
1194 0         0 $self->{seqinds}{_frameshiftRes_sbjct}{ $resCount_sbjct - ($_ * $sdir * $sfs) } = $sfs for @srange;
1195 0         0 $matchtype = "$sfs frameshift-sbcjt";
1196 0         0 $sgap = $qgap = 1;
1197             }
1198             elsif ($qchar eq $self->{GAP_SYMBOL}) {
1199             # gap are counted as being in the immediately preceeding residue
1200             # position; for translated sequences this is not the start of
1201             # the previous codon, but the third codon position
1202 512         1034 $self->{seqinds}{_gapRes_query}{ $resCount_query - $qdir }++ for @qrange;
1203 512         635 $matchtype = 'gap-query';
1204 512         467 $qgap++;
1205             }
1206             elsif ($schar eq $self->{GAP_SYMBOL}) {
1207 491         973 $self->{seqinds}{_gapRes_sbjct}{ $resCount_sbjct - $sdir }++ for @srange;
1208 491         516 $matchtype = 'gap-sbjct';
1209 491         467 $sgap++;
1210             }
1211             else {
1212             # if not a gap or frameshift in either seq, must be mismatch
1213 3469         9023 $self->{seqinds}{_mismatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1214 3469         9528 $self->{seqinds}{_mismatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1215 3469         3663 $matchtype = 'mismatch';
1216             }
1217             # always add a nomatch unless the current position in the seq is a gap
1218 4474 100       5654 if (!$sgap) {
1219 3981         10297 $self->{seqinds}{_nomatchRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
1220             }
1221 4474 100       5580 if (!$qgap) {
1222 3960         7749 $self->{seqinds}{_nomatchRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
1223             }
1224             } else {
1225 0         0 $self->warn("Unknown midline character: [$mchar]");
1226             }
1227             # leave in and uncomment for future debugging
1228             #$self->debug(sprintf("%7d %1s[%1s]%1s %-7d Type: %-20s QOff:%-2d SOff:%-2d\n",
1229             # $resCount_query,
1230             # $qchar,
1231             # $mchar,
1232             # $schar,
1233             # $resCount_sbjct,
1234             # $matchtype,
1235             # ($self->{_query_offset} * $qdir),
1236             # ($self->{_sbjct_offset} * $sdir)));
1237 30602 100       41328 $resCount_query += ($qdir * (scalar(@qrange) + $qfs)) if !$qgap;
1238 30602 100       50956 $resCount_sbjct += ($sdir * (scalar(@srange) + $sfs)) if !$sgap;
1239             }
1240 73         169 return 1;
1241             }
1242              
1243             sub _calculate_seq_offsets {
1244 107     107   136 my $self = shift;
1245 107         230 my $prog = $self->algorithm;
1246 107         269 ($self->{_sbjct_offset}, $self->{_query_offset}) = (1,1);
1247 107 100       560 if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
    100          
1248 68         96 $self->{_sbjct_offset} = 3;
1249 68 100 66     283 if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
1250 67         98 $self->{_query_offset} = 3;
1251             }
1252             } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
1253 3         9 $self->{_query_offset} = 3;
1254             }
1255 107         151 1;
1256             }
1257              
1258             =head2 n
1259              
1260             See documentation in L
1261              
1262             =cut
1263              
1264             sub n {
1265 113     113 1 274 my $self = shift;
1266 113 50       207 if(@_) { $self->{'N'} = shift; }
  0         0  
1267             # note that returning 1 is completely an assumption
1268 113 100       356 defined $self->{'N'} ? $self->{'N'} : 1;
1269             }
1270              
1271             =head2 range
1272              
1273             See documentation in L
1274              
1275             =cut
1276              
1277             sub range {
1278 1659     1659 1 2485 my ($self, $seqType) = @_;
1279              
1280 1659   100     2371 $seqType ||= 'query';
1281 1659 100       2479 $seqType = 'sbjct' if $seqType eq 'hit';
1282              
1283 1659         1752 my ($start, $end);
1284 1659 100       2541 if( $seqType eq 'query' ) {
1285 840         1516 $start = $self->query->start;
1286 840         1376 $end = $self->query->end;
1287             }
1288             else {
1289 819         1479 $start = $self->hit->start;
1290 819         1450 $end = $self->hit->end;
1291             }
1292 1659         4819 return ($start, $end);
1293             }
1294              
1295              
1296             =head2 links
1297              
1298             Title : links
1299             Usage : $obj->links($newval)
1300             Function: Get/Set the Links value (from WU-BLAST)
1301             Indicates the placement of the alignment in the group of HSPs
1302             Returns : Value of links
1303             Args : On set, new value (a scalar or undef, optional)
1304              
1305              
1306             =cut
1307              
1308             sub links{
1309 54     54 1 94 my $self = shift;
1310              
1311 54 50       119 return $self->{LINKS} = shift if @_;
1312 54         176 return $self->{LINKS};
1313             }
1314              
1315             =head2 hsp_group
1316              
1317             Title : hsp_group
1318             Usage : $obj->hsp_group($newval)
1319             Function: Get/Set the Group value (from WU-BLAST)
1320             Indicates a grouping of HSPs
1321             Returns : Value of group
1322             Args : On set, new value (a scalar or undef, optional)
1323              
1324              
1325             =cut
1326              
1327             sub hsp_group {
1328 12     12 1 29 my $self = shift;
1329              
1330 12 50       41 return $self->{HSP_GROUP} = shift if @_;
1331 12         53 return $self->{HSP_GROUP};
1332             }
1333              
1334             =head2 hit_features
1335              
1336             Title : hit_features
1337             Usage : $obj->hit_features($newval)
1338             Function: Get/Set the HSP hit feature string (from some BLAST 2.2.13 text
1339             output), which is a string of overlapping or nearby features in HSP
1340             hit
1341             Returns : Value of hit features, if present
1342             Args : On set, new value (a scalar or undef, optional)
1343              
1344              
1345             =cut
1346              
1347             sub hit_features {
1348 6     6 1 35 my $self = shift;
1349              
1350 6 50       19 return $self->{HIT_FEATURES} = shift if @_;
1351 6         27 return $self->{HIT_FEATURES};
1352             }
1353              
1354             # The cigar string code is written by Juguang Xiao
1355              
1356             =head1 Brief introduction on cigar string
1357              
1358             NOTE: the concept is originally from EnsEMBL docs at
1359             http://may2005.archive.ensembl.org/Docs/wiki/html/EnsemblDocs/CigarFormat.html
1360             Please append to these docs if you have a better definition.
1361              
1362             Sequence alignment hits can be stored in a database as ungapped alignments.
1363             This imposes 2 major constraints on alignments:
1364              
1365             a) alignments for a single hit record require multiple rows in the database,
1366             and
1367             b) it is not possible to accurately retrieve the exact original alignment.
1368              
1369             Alternatively, sequence alignments can be stored as gapped alignments using
1370             the CIGAR line format (where CIGAR stands for Concise Idiosyncratic Gapped
1371             Alignment Report).
1372              
1373             In the cigar line format alignments are stored as follows:
1374              
1375             M: Match
1376             D: Deletion
1377             I: Insertion
1378              
1379             An example of an alignment for a hypthetical protein match is shown below:
1380              
1381              
1382             Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
1383              
1384             PG P G GP R PLGP
1385              
1386             Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
1387              
1388              
1389             protein_align_feature table as the following cigar line:
1390              
1391             7M4D12M2I2MD7M
1392              
1393             =head2 cigar_string
1394              
1395             Name: cigar_string
1396             Usage: $cigar_string = $hsp->cigar_string
1397             Function: Generate and return cigar string for this HSP alignment
1398             Args: No input needed
1399             Return: a cigar string
1400              
1401             =cut
1402              
1403              
1404             sub cigar_string {
1405 8     8 1 24 my ($self, $arg) = @_;
1406 8 50       17 $self->warn("this is not a setter") if(defined $arg);
1407              
1408 8 100       16 unless(defined $self->{_cigar_string}){ # generate cigar string
1409 7         22 my $cigar_string = $self->generate_cigar_string($self->query_string, $self->hit_string);
1410 7         14 $self->{_cigar_string} = $cigar_string;
1411             } # end of unless
1412              
1413 8         29 return $self->{_cigar_string};
1414             }
1415              
1416             =head2 generate_cigar_string
1417              
1418             Name: generate_cigar_string
1419             Usage: my $cigar_string = Bio::Search::HSP::GenericHSP::generate_cigar_string ($qstr, $hstr);
1420             Function: generate cigar string from a simple sequence of alignment.
1421             Args: the string of query and subject
1422             Return: cigar string
1423              
1424             =cut
1425              
1426             sub generate_cigar_string {
1427 7     7 1 13 my ($self, $qstr, $hstr) = @_;
1428 7         211 my @qchars = split //, $qstr;
1429 7         193 my @hchars = split //, $hstr;
1430              
1431 7 50       16 unless(scalar(@qchars) == scalar(@hchars)){
1432 0         0 $self->throw("two sequences are not equal in lengths");
1433             }
1434              
1435 7         15 $self->{_count_for_cigar_string} = 0;
1436 7         10 $self->{_state_for_cigar_string} = 'M';
1437              
1438 7         9 my $cigar_string = '';
1439 7         17 for(my $i=0; $i <= $#qchars; $i++){
1440 1917         1984 my $qchar = $qchars[$i];
1441 1917         1658 my $hchar = $hchars[$i];
1442 1917 100 100     3961 if($qchar ne $self->{GAP_SYMBOL} && $hchar ne $self->{GAP_SYMBOL}){ # Match
    100          
    50          
1443 1837         2096 $cigar_string .= $self->_sub_cigar_string('M');
1444             }elsif($qchar eq $self->{GAP_SYMBOL}){ # Deletion
1445 13         22 $cigar_string .= $self->_sub_cigar_string('D');
1446             }elsif($hchar eq $self->{GAP_SYMBOL}){ # Insertion
1447 67         85 $cigar_string .= $self->_sub_cigar_string('I');
1448             }else{
1449 0         0 $self->throw("Impossible state that 2 gaps on each seq aligned");
1450             }
1451             }
1452 7         12 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
1453 7         159 return $cigar_string;
1454             }
1455              
1456             # an internal method to help generate cigar string
1457              
1458             sub _sub_cigar_string {
1459 1924     1924   2043 my ($self, $new_state) = @_;
1460              
1461 1924         1681 my $sub_cigar_string = '';
1462 1924 100       2143 if($self->{_state_for_cigar_string} eq $new_state){
1463 1835         1648 $self->{_count_for_cigar_string} += 1; # Remain the state and increase the counter
1464             }else{
1465             $sub_cigar_string .= $self->{_count_for_cigar_string}
1466 89 100       131 unless $self->{_count_for_cigar_string} == 1;
1467 89         87 $sub_cigar_string .= $self->{_state_for_cigar_string};
1468 89         82 $self->{_count_for_cigar_string} = 1;
1469 89         88 $self->{_state_for_cigar_string} = $new_state;
1470             }
1471 1924         3644 return $sub_cigar_string;
1472             }
1473              
1474             # needed before seqfeatures can be made
1475             sub _pre_seq_feature {
1476 1177     1177   1410 my $self = shift;
1477 1177         1890 my $algo = $self->{ALGORITHM};
1478              
1479 1177         1912 my ($queryfactor, $hitfactor) = (0,0);
1480 1177         1790 my ($hitmap, $querymap) = (1,1);
1481 1177 100 100     12547 if( $algo =~ /^(?:PSI)?T(?:BLAST|FAST|SW)[NY]/oi ) {
    100 100        
    100 100        
    100          
1482 7         14 $hitfactor = 1;
1483 7         11 $hitmap = 3;
1484             }
1485             elsif ($algo =~ /^(?:FAST|BLAST)(?:X|Y|XY)/oi || $algo =~ /^P?GENEWISE/oi ) {
1486 25         40 $queryfactor = 1;
1487 25         30 $querymap = 3;
1488             }
1489             elsif ($algo =~ /^T(BLAST|FAST|SW)(X|Y|XY)/oi || $algo =~ /^(BLAST|FAST|SW)N/oi || $algo =~ /^WABA|AXT|BLAT|BLASTZ|PSL|MEGABLAST|EXONERATE|SW|SSEARCH|SMITH\-WATERMAN|SIM4$/){
1490 286 100       893 if ($2) {
1491 126         185 $hitmap = $querymap = 3;
1492             }
1493 286         366 $hitfactor = 1;
1494 286         337 $queryfactor = 1;
1495             }
1496             elsif ($algo =~ /^RPS-BLAST/) {
1497 1 50       4 if ($algo =~ /^RPS-BLAST\(BLASTX\)/) {
1498 0         0 $queryfactor = 1;
1499 0         0 $querymap = 3;
1500             }
1501 1         1 $hitfactor = 0;
1502             }
1503             else {
1504 858         2404 my $stranded = lc substr($self->{STRANDED}, 0,1);
1505 858 100 66     2383 $queryfactor = ($stranded eq 'q' || $stranded eq 'b') ? 1 : 0;
1506 858 100 100     3263 $hitfactor = ($stranded eq 'h' || $stranded eq 's' || $stranded eq 'b') ? 1 : 0;
1507             }
1508 1177         1908 $self->{_query_factor} = $queryfactor;
1509 1177         1698 $self->{_hit_factor} = $hitfactor;
1510 1177         1667 $self->{_hit_mapping} = $hitmap;
1511 1177         2082 $self->{_query_mapping} = $querymap;
1512             }
1513              
1514             # make query seq feature
1515             sub _query_seq_feature {
1516 1176     1176   1472 my $self = shift;
1517 1176         1959 $self->{_making_qff} = 1;
1518 1176         1968 my $qs = $self->{QUERY_START};
1519 1176         1736 my $qe = $self->{QUERY_END};
1520 1176 100       2326 unless (defined $self->{_query_factor}) {
1521 1133         2221 $self->_pre_seq_feature;
1522             }
1523 1176         1658 my $queryfactor = $self->{_query_factor};
1524              
1525 1176 50 33     3483 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin"); }
  0         0  
1526              
1527 1176         1353 my $strand;
1528 1176 100       2051 if ($qe > $qs) { # normal query: start < end
1529 1106 100       1448 if ($queryfactor) {
1530 843         1158 $strand = 1;
1531             }
1532             else {
1533 263         353 $strand = undef;
1534             }
1535             }
1536             else {
1537 70 50       145 if ($queryfactor) {
1538 70         114 $strand = -1;
1539             }
1540             else {
1541 0         0 $strand = undef;
1542             }
1543 70         197 ($qs,$qe) = ($qe,$qs);
1544             }
1545              
1546             # Note: many of these data are not query- and hit-specific.
1547             # Only start, end, name, length are.
1548             # We could be more efficient by only storing this info once.
1549             # steve chervitz --- Sat Apr 5 00:55:07 2003
1550              
1551 1176   33     2586 my $sim1 = $self->{_sim1} || Bio::SeqFeature::Similarity->new();
1552 1176         3217 $sim1->start($qs);
1553 1176         2758 $sim1->end($qe);
1554 1176         4164 $sim1->significance($self->{EVALUE});
1555 1176         3807 $sim1->bits($self->{BITS});
1556 1176         3374 $sim1->score($self->{SCORE});
1557 1176         2666 $sim1->strand($strand);
1558 1176         3475 $sim1->seq_id($self->{QUERY_NAME});
1559 1176         3231 $sim1->seqlength($self->{QUERY_LENGTH});
1560 1176         2965 $sim1->source_tag($self->{ALGORITHM});
1561 1176         3425 $sim1->seqdesc($self->{QUERY_DESC});
1562 1176 100       4698 $sim1->add_tag_value('meta', $self->{META}) if $self->can('meta');
1563             # to determine frame from something like FASTXY which doesn't
1564             # report the frame
1565 1176         1797 my $qframe = $self->{QUERY_FRAME};
1566            
1567 1176 100 100     4828 if (defined $strand && !defined $qframe && $queryfactor) {
    100 66        
1568 603         994 $qframe = ( $qs % 3 ) * $strand;
1569             }
1570             elsif (!defined $strand) {
1571 263         354 $qframe = 0;
1572             }
1573            
1574 1176 50       4908 if( $qframe =~ /^([+-])?([0-3])/ ) {
1575 1176   100     4319 my $dir = $1 || '+';
1576 1176 50 33     4285 if($qframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
      66        
1577 0         0 $self->warn("Query frame ($qframe) did not match strand of query ($strand)");
1578             }
1579 1176 100       3583 $qframe = $2 != 0 ? $2 - 1 : $2;
1580             } else {
1581 0         0 $self->warn("Unknown query frame ($qframe)");
1582 0         0 $qframe = 0;
1583             }
1584            
1585 1176         3270 $sim1->frame($qframe);
1586 1176         3290 $self->SUPER::feature1($sim1);
1587              
1588 1176         1859 $self->{_created_qff} = 1;
1589 1176         1972 $self->{_making_qff} = 0;
1590             }
1591              
1592             # make subject seq feature
1593             sub _subject_seq_feature {
1594 562     562   737 my $self = shift;
1595 562         1041 $self->{_making_sff} = 1;
1596 562         890 my $hs = $self->{HIT_START};
1597 562         847 my $he = $self->{HIT_END};
1598 562 100       1210 unless (defined $self->{_hit_factor}) {
1599 44         108 $self->_pre_seq_feature;
1600             }
1601 562         721 my $hitfactor = $self->{_hit_factor};
1602              
1603 562 50 33     1684 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
  0         0  
1604              
1605 562         701 my $strand;
1606 562 100       1271 if ($he > $hs) { # normal subject
1607 455 100       817 if ($hitfactor) {
1608 199         305 $strand = 1;
1609             }
1610             else {
1611 256         336 $strand = undef;
1612             }
1613             }
1614             else {
1615 107 100       242 if ($hitfactor) {
1616 106         157 $strand = -1;
1617             }
1618             else {
1619 1         2 $strand = undef;
1620             }
1621 107         262 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
1622             }
1623              
1624 562   33     2182 my $sim2 = $self->{_sim2} || Bio::SeqFeature::Similarity->new();
1625 562         1326 $sim2->start($hs);
1626 562         1292 $sim2->end($he);
1627 562         1585 $sim2->significance($self->{EVALUE});
1628 562         1584 $sim2->bits($self->{BITS});
1629 562         1527 $sim2->score($self->{SCORE});
1630 562         1276 $sim2->strand($strand);
1631 562         1519 $sim2->seq_id($self->{HIT_NAME});
1632 562         1391 $sim2->seqlength($self->{HIT_LENGTH});
1633 562         1464 $sim2->source_tag($self->{ALGORITHM});
1634 562         1242 $sim2->seqdesc($self->{HIT_DESC});
1635 562 100       1917 $sim2->add_tag_value('meta', $self->{META}) if $self->can('meta');
1636 562         985 my $hframe = $self->{HIT_FRAME};
1637            
1638 562 100 100     2216 if (defined $strand && !defined $hframe && $hitfactor) {
    100 66        
1639 3         11 $hframe = ( $hs % 3 ) * $strand;
1640             }
1641             elsif (!defined $strand) {
1642 257         350 $hframe = 0;
1643             }
1644            
1645 562 50       2323 if( $hframe =~ /^([+-])?([0-3])/ ) {
1646 562   100     2132 my $dir = $1 || '+';
1647 562 50 33     1672 if($hframe && (($dir eq '-' && $strand >= 0) || ($dir eq '+' && $strand <= 0)) ) {
      66        
1648 0         0 $self->warn("Subject frame ($hframe) did not match strand of subject ($strand)");
1649             }
1650 562 100       1873 $hframe = $2 != 0 ? $2 - 1 : $2;
1651             } else {
1652 0         0 $self->warn("Unknown subject frame ($hframe)");
1653 0         0 $hframe = 0;
1654             }
1655            
1656 562         1635 $sim2->frame($hframe);
1657 562         1805 $self->SUPER::feature2($sim2);
1658              
1659 562         866 $self->{_created_sff} = 1;
1660 562         978 $self->{_making_sff} = 0;
1661             }
1662              
1663             # before calling the num_* methods
1664             sub _pre_similar_stats {
1665 438     438   556 my $self = shift;
1666 438         721 my $identical = $self->{IDENTICAL};
1667 438         596 my $conserved = $self->{CONSERVED};
1668 438         566 my $percent_id = $self->{PERCENT_IDENTITY};
1669              
1670 438 50       709 if (! defined $identical) {
1671 0 0       0 if (! defined $percent_id) {
1672 0         0 $self->warn("Did not defined the number of identical matches or overall percent identity in the HSP; assuming 0");
1673 0         0 $identical = 0;
1674             }
1675             else {
1676 0         0 $identical = sprintf("%.0f",$percent_id * $self->{HSP_LENGTH});
1677             }
1678             }
1679              
1680 438 50       684 if (! defined $conserved) {
1681             $self->warn("Did not define the number of conserved matches in the HSP; assuming conserved == identical ($identical)")
1682 0 0       0 if( $self->{ALGORITHM} !~ /^((FAST|BLAST)N)|EXONERATE|SIM4|AXT|PSL|BLAT|BLASTZ|WABA/oi);
1683 0         0 $conserved = $identical;
1684             }
1685 438         645 $self->{IDENTICAL} = $identical;
1686 438         597 $self->{CONSERVED} = $conserved;
1687 438         751 $self->{_did_presimilar} = 1;
1688             }
1689              
1690             # before calling the frac_* methods
1691              
1692             sub _pre_frac {
1693 105     105   135 my $self = shift;
1694            
1695 105         159 my $hsp_len = $self->{HSP_LENGTH};
1696 105         165 my $hit_len = $self->{HIT_LENGTH};
1697 105         148 my $query_len = $self->{QUERY_LENGTH};
1698              
1699 105         241 my $identical = $self->num_identical;
1700 105         236 my $conserved = $self->num_conserved;
1701              
1702 105         186 $self->{_did_prefrac} = 1;
1703 105         122 my $logical;
1704 105 50       209 if( $hsp_len ) {
1705 105         247 $self->length('total', $hsp_len);
1706 105         247 $logical = $self->_logical_length('total');
1707 105         342 $self->frac_identical( 'total', $identical / $hsp_len);
1708 105         306 $self->frac_conserved( 'total', $conserved / $hsp_len);
1709             }
1710 105 100       194 if( $hit_len ) {
1711 104         198 $logical = $self->_logical_length('hit');
1712 104         336 $self->frac_identical( 'hit', $identical / $logical);
1713 104         276 $self->frac_conserved( 'hit', $conserved / $logical);
1714             }
1715 105 100       202 if( $query_len ) {
1716 104         209 $logical = $self->_logical_length('query');
1717 104         322 $self->frac_identical( 'query', $identical / $logical) ;
1718 104         228 $self->frac_conserved( 'query', $conserved / $logical);
1719             }
1720             }
1721              
1722             # before calling gaps()
1723             # This relies first on passed parameters (parser-dependent), then on gaps
1724             # calculated by seq_inds() (if set), then falls back to directly checking
1725             # for '-' or '.' as a last resort
1726              
1727             sub _pre_gaps {
1728 378     378   533 my $self = shift;
1729 378         561 my $query_gaps = $self->{QUERY_GAPS};
1730 378         599 my $query_seq = $self->{QUERY_SEQ};
1731 378         460 my $hit_gaps = $self->{HIT_GAPS};
1732 378         560 my $hit_seq = $self->{HIT_SEQ};
1733 378         480 my $gaps = $self->{HSP_GAPS};
1734              
1735 378         578 $self->{_did_pregaps} = 1; # well, we're in the process; avoid recursion
1736 378 50       940 if( defined $query_gaps ) {
    100          
1737 0         0 $self->gaps('query', $query_gaps);
1738             } elsif( defined $query_seq ) {
1739 363 50       982 my $qg = (defined $self->{'_query_offset'}) ? $self->seq_inds('query','gaps')
    100          
1740             : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1741             : scalar( $query_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1742 363   100     955 my $offset = $self->{'_query_offset'} || 1;
1743 363         1011 $self->gaps('query', $qg/$offset);
1744             }
1745 378 50       953 if( defined $hit_gaps ) {
    100          
1746 0         0 $self->gaps('hit', $hit_gaps);
1747             } elsif( defined $hit_seq ) {
1748 358 100       952 my $hg = (defined $self->{'_sbjct_offset'}) ? $self->seq_inds('hit','gaps')
    100          
1749             : ($self->algorithm eq 'ERPIN') ? scalar( $hit_seq =~ tr/\-//)
1750             : scalar( $hit_seq =~ tr/\-\.// ); # HMMER3 and Infernal uses '.' and '-'
1751 358   100     934 my $offset = $self->{'_sbjct_offset'} || 1;
1752 358         766 $self->gaps('hit', $hg/$offset);
1753             }
1754 378 100       724 if( ! defined $gaps ) {
1755 329         616 $gaps = $self->gaps("query") + $self->gaps("hit");
1756             }
1757 378         775 $self->gaps('total', $gaps);
1758             }
1759              
1760             # before percent_identity
1761             sub _pre_pi {
1762 42     42   73 my $self = shift;
1763 42         92 $self->{_did_prepi} = 1;
1764 42 50 33     324 $self->percent_identity($self->{PERCENT_IDENTITY} || $self->frac_identical('total')*100) if( $self->{HSP_LENGTH} > 0 );
1765             }
1766              
1767             1;